// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Overview (derived from the instructions below).
//
// Signed 8-bit GEMM micro-kernel with float requantization. Each pass of the
// outer loop (label 0) produces 4 rows x 16 columns of 8-bit output:
//   1. Sixteen 32-bit accumulators per row are seeded from 64 bytes read at w.
//   2. The inner loop (label 1) consumes 4 bytes of every A row and 64 bytes
//      of w per iteration and accumulates with SDOT, for kc rounded up to a
//      multiple of 4.
//   3. Accumulators are converted to float (SCVTF), multiplied by 16
//      per-column scales read from w, and converted back to int32 (FCVTNS).
//   4. Results are narrowed with saturation to 16 bits, a 16-bit value read
//      from params is added with saturation, the sums are narrowed with
//      saturation to 8 bits, and finally clamped to the min and max bytes
//      that follow in params.
//   5. When at least 16 columns remain, 16 bytes per row are stored and the
//      loop continues while columns remain. Otherwise the remaining columns
//      are stored in 8/4/2/1-byte pieces and the function returns.
//
// w, as consumed here per group of 16 columns: 64 bytes of initial
// accumulator values, then 64 bytes for every 4 bytes of kc, then 64 bytes
// of scales.
// params, as consumed here: a 2-byte value, then a 1-byte min, then a 1-byte
// max.
//
// Rows with index >= mr reuse the pointers of the previous row (for both A
// and C), so they recompute and re-store that row instead of touching other
// memory.
// NOTE(review): mr is presumably in 1..4 and nc > 0 - confirm against callers.
//
// Only x0-x9, x11-x13, x15, v0-v7 and v16-v31 are written, and the stack is
// only read, so there is no prologue or epilogue.

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

        # Clamp A and C pointers
        // The flags set by this CMP are consumed by the LO selects for row 1
        // and by the LS selects for row 2. The ADD and BIC instructions in
        // between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3               // second half of the kc round-up: clear the low 2 bits

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        // The 9th and 10th arguments are read from the stack.
        LDP     x12, x11, [sp]          // cn_stride, params

        // From here on x4 and x7 no longer hold the strides: they become the
        // row 3 pointers for A and C.
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        // Outer loop: one iteration per group of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Row 0 accumulators (v16, v20, v24, v28) are loaded from w and then
        // copied to rows 1-3, so every row starts from the same 16 values.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Main loop - 4 bytes of A
        // LD1R replicates the same 4 bytes of an A row into all four 32-bit
        // lanes. Each LDR q loads 16 bytes of B (4 bytes for each of 4
        // columns), so every SDOT lane adds the dot product of 4 A bytes with
        // the 4 B bytes of one column to that column's accumulator.
        // v4/v5/v6/v7 feed columns 0-3/4-7/8-11/12-15 respectively.
        .p2align 3
1:
        LD1R    {v0.4s},  [x3], 4
        LDR     q4, [x5], 16
        LD1R    {v1.4s}, [x15], 4
        LD1R    {v2.4s}, [x13], 4
        LD1R    {v3.4s},  [x4], 4
        SDOT    v16.4s, v4.16b, v0.16b
        SDOT    v17.4s, v4.16b, v1.16b
        LDR     q5, [x5], 16
        SDOT    v18.4s, v4.16b, v2.16b
        SDOT    v19.4s, v4.16b, v3.16b
        LDR     q6, [x5], 16
        SDOT    v20.4s, v5.16b, v0.16b
        SDOT    v21.4s, v5.16b, v1.16b
        LDR     q7, [x5], 16
        SDOT    v22.4s, v5.16b, v2.16b
        SDOT    v23.4s, v5.16b, v3.16b
        SUBS    x0, x0, 4               // k -= 4; flags stay live until B.HI (SDOT does not set flags)
        SDOT    v24.4s, v6.16b, v0.16b
        SDOT    v25.4s, v6.16b, v1.16b
        SDOT    v26.4s, v6.16b, v2.16b
        SDOT    v27.4s, v6.16b, v3.16b
        SDOT    v28.4s, v7.16b, v0.16b
        SDOT    v29.4s, v7.16b, v1.16b
        SDOT    v30.4s, v7.16b, v2.16b
        SDOT    v31.4s, v7.16b, v3.16b
        B.HI    1b                      // loop while k was greater than 4 before the subtract (unsigned)

        // Requantization step 1: int32 accumulators -> float.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        // Requantization step 2: multiply each column by its own scale.
        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        // v4 is reloaded here, after its last use for columns 0-3 above.
        LDR     q4, [x5], 16            // scales for columns 12-15
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Requantization step 3: float -> int32, rounding to nearest with
        // ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. After the SQXTN/SQXTN2 pairs,
        // v16-v19 hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // v6 is free again here (its last use as a scale was the FMULs above).
        // NOTE(review): the 16-bit value is presumably the output zero point -
        // confirm against the params struct definition.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the 16-bit params value to every output.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8: v0-v3 each hold the 16 output bytes
        // of one row (v0 = row 0 ... v3 = row 3).
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // The two post-indexed loads above advanced x11 by 2 + 1 bytes; undo
        // that so the next pass reads params from its start again.
        SUB     x11, x11, 3             // rewind params pointer

        // Signed clamp to [min, max].
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. These flags are used twice: by B.LO just below and by
        // B.NE at the end of the full store. Nothing in between sets flags.
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    2f                      // fewer than 16 columns were left: partial store

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x12); each A pointer is moved
        // back by the rounded-up kc so the next pass re-reads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b                      // more columns remain (nc - 16 != 0)
        RET

        # Store odd width
        // x1 now holds nc - 16 (wrapped around). Subtracting 16 leaves the low
        // four bits unchanged, so bits 3..0 of x1 still say whether an 8-, 4-,
        // 2- and 1-byte piece must be written. After each partial store, DUP
        // moves the next unwritten bytes of every row down to the low lane.
        .p2align 3
2:
        TBZ     x1, 3, 3f               // skip the 8-byte store if bit 3 is clear
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
3:
        TBZ     x1, 2, 4f               // skip the 4-byte store if bit 2 is clear
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
4:
        TBZ     x1, 1, 5f               // skip the 2-byte store if bit 1 is clear
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
5:
        TBZ     x1, 0, 6f               // skip the 1-byte store if bit 0 is clear
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
6:
        // The partial store is always the last group of columns.
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

// On ELF targets, emit an empty .note.GNU-stack section.
// NOTE(review): by convention this marks the object as not needing an
// executable stack - confirm against the linker documentation.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif