// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): this file is generated; durable changes belong in the
// template named above, not here.

#include <xnnpack/assembly.h>

// Overview
// --------
// Signed 8-bit GEMM microkernel producing up to 4 rows x 16 columns of int8
// output per pass of the outer loop (label 0).
//
//   * kc is rounded up to a multiple of 4 and the inner loop (label 1)
//     consumes 4 bytes of every A row per iteration, so when kc is not a
//     multiple of 4 up to 3 bytes beyond kc are read from each A row.
//     (Presumably the caller guarantees this over-read is harmless - confirm.)
//   * Each pass reads 64 bytes from w into the accumulators (16 x 32-bit
//     lanes), then 64 bytes of B per inner iteration. x5 is never rewound, so
//     the data for the next 16 columns must follow directly in w.
//   * Accumulators are converted to float, multiplied by one 32-bit float
//     scale, converted back with round-to-nearest-even (FCVTNS), narrowed
//     with saturation to 16 bits, offset by a 16-bit value, narrowed with
//     saturation to 8 bits and clamped to [min, max].
//   * params is read through x11 with this byte layout:
//       +0  32-bit float scale
//       +4  16-bit value added after the first narrowing
//       +6  8-bit clamp minimum
//       +7  8-bit clamp maximum
//   * Rows at index >= mr reuse the A and C pointers of the previous row, so
//     they recompute and re-store that row's values.
//   * Only x0-x9, x11-x13, x15, v0-v7 and v16-v31 are written, so nothing
//     callee-saved is touched and no stack frame is created.
//   * SDOT belongs to the Armv8.2-A dot-product extension.

# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

// Accumulator column mapping (per row): v16.. hold columns 0-3, v20.. hold
// columns 4-7, v24.. hold columns 8-11 and v28.. hold columns 12-15; this is
// the order in which the narrowing sequence below reassembles them.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

        # Clamp A and C pointers
        // The flags set by this CMP are consumed twice: by the LO selects
        // (mr < 2) and, further down, by the LS selects (mr <= 2). None of the
        // ADD / BIC / CSEL instructions in between modifies the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round-up started by the ADD above

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        // The two stack-passed arguments are read in place; sp is not moved.
        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2
        // From here on x4 holds a3 and x7 holds c3; a_stride and cm_stride are
        // no longer available. x0 (mr) is also dead and is reused as the k
        // counter below.

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes are loaded into the row-0 accumulators and copied to rows
        // 1-3, so every row starts from the same 16 values.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Main loop - 4 bytes of A
        // Per iteration: LD1R broadcasts the next 4 bytes of each A row to all
        // four 32-bit lanes, four LDRs fetch 64 bytes of B, and each SDOT adds
        // a 4-element signed 8-bit dot product into every 32-bit accumulator
        // lane. Loads are interleaved with the SDOTs; keep the order.
        .p2align 3
1:
        LD1R    {v0.4s}, [x3], 4
        LDR     q4, [x5], 16
        LD1R    {v1.4s}, [x15], 4
        LD1R    {v2.4s}, [x13], 4
        LD1R    {v3.4s}, [x4], 4
        SDOT    v16.4s, v4.16b, v0.16b
        SDOT    v17.4s, v4.16b, v1.16b
        LDR     q5, [x5], 16
        SDOT    v18.4s, v4.16b, v2.16b
        SDOT    v19.4s, v4.16b, v3.16b
        LDR     q6, [x5], 16
        SDOT    v20.4s, v5.16b, v0.16b
        SDOT    v21.4s, v5.16b, v1.16b
        LDR     q7, [x5], 16
        SDOT    v22.4s, v5.16b, v2.16b
        SDOT    v23.4s, v5.16b, v3.16b
        // Flags for the B.HI below are produced here; the SDOTs that follow do
        // not alter them.
        SUBS    x0, x0, 4
        SDOT    v24.4s, v6.16b, v0.16b
        SDOT    v25.4s, v6.16b, v1.16b
        SDOT    v26.4s, v6.16b, v2.16b
        SDOT    v27.4s, v6.16b, v3.16b
        SDOT    v28.4s, v7.16b, v0.16b
        SDOT    v29.4s, v7.16b, v1.16b
        SDOT    v30.4s, v7.16b, v2.16b
        SDOT    v31.4s, v7.16b, v3.16b
        B.HI    1b                      // loop while bytes of k remain (k was > 4 before the SUBS)

        // Requantization: int32 -> float, scale, float -> int32 (nearest even).
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        // v4 is free again (B is fully consumed); it now carries the scale
        // broadcast to all four lanes. x11 advances by 4.
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow 32 -> 16 bits. The low halves take columns 0-3 and
        // 8-11; the SQXTN2 group below fills the high halves with columns 4-7
        // and 12-15, leaving v16-v19 = columns 0-7 and v24-v27 = columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // 16-bit value broadcast to eight lanes; x11 advances by 2.
        // NOTE(review): presumably the output zero point - confirm against the
        // params struct definition.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating 16-bit add of the broadcast value.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow 16 -> 8 bits; v0-v3 end up holding the 16 output
        // bytes of rows 0-3 in column order.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // The three post-incrementing loads advanced x11 by 4 + 2 + 1 = 7 (the
        // max-value load does not post-increment), so this restores x11 for
        // the next pass of the outer loop.
        SUB     x11, x11, 7             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. These flags feed both the B.LO below (fewer than 16
        // columns were left) and the B.NE after the full store (columns still
        // remain). SMIN, ST1 and the non-flag-setting SUBs in between leave
        // the flags untouched.
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    2f

        # Store full 4 x 16
        // Each C row pointer is post-incremented by cn_stride (x12). Each A
        // row pointer is moved back by the rounded kc in x2, which is exactly
        // the distance the main loop advanced it.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns to produce
        RET

        # Store odd width
        // Reached with x1 = nc - 16 for nc < 16. Subtracting 16 leaves bits
        // 3..0 unchanged, so those bits still encode the number of columns
        // left. Each set bit stores 8, 4, 2 or 1 bytes per row; after a store
        // the DUPs shift the next unstored bytes down to the bottom of v0-v3.
        .p2align 3
2:
        TBZ     x1, 3, 3f               // skip unless 8 columns are pending
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
3:
        TBZ     x1, 2, 4f               // skip unless 4 columns are pending
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
4:
        TBZ     x1, 1, 5f               // skip unless 2 columns are pending
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
5:
        TBZ     x1, 0, 6f               // skip unless 1 column is pending
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
6:
        // The partial block is always the last one, so the kernel returns
        // without looping back to label 0.
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif