// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Main loop - 4 bytes of A per iteration.  LD1R broadcasts the same
        # 4 A bytes to every 32-bit lane, so each SDOT accumulates 4 int8
        # products into each int32 lane; 16 SDOTs update the full 4x16 tile.
        .p2align 3
1:
        LD1R    {v0.4s}, [x3], 4
        LDR     q4, [x5], 16
        LD1R    {v1.4s}, [x15], 4
        LD1R    {v2.4s}, [x13], 4
        LD1R    {v3.4s}, [x4], 4
        SDOT    v16.4s, v4.16b, v0.16b
        SDOT    v17.4s, v4.16b, v1.16b
        LDR     q5, [x5], 16
        SDOT    v18.4s, v4.16b, v2.16b
        SDOT    v19.4s, v4.16b, v3.16b
        LDR     q6, [x5], 16
        SDOT    v20.4s, v5.16b, v0.16b
        SDOT    v21.4s, v5.16b, v1.16b
        LDR     q7, [x5], 16
        SDOT    v22.4s, v5.16b, v2.16b
        SDOT    v23.4s, v5.16b, v3.16b
        SUBS    x0, x0, 4
        SDOT    v24.4s, v6.16b, v0.16b
        SDOT    v25.4s, v6.16b, v1.16b
        SDOT    v26.4s, v6.16b, v2.16b
        SDOT    v27.4s, v6.16b, v3.16b
        SDOT    v28.4s, v7.16b, v0.16b
        SDOT    v29.4s, v7.16b, v1.16b
        SDOT    v30.4s, v7.16b, v2.16b
        SDOT    v31.4s, v7.16b, v3.16b
        B.HI    1b

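        # Editorial note: the requantization below follows the rndnu scheme,
        # consuming params in order from x11:
        #   acc = SQSHL(acc, pre_shift)      saturating shift into upper bits
        #   acc = SQDMULH(acc, multiplier)   fixed-point scale, no rounding
        #   acc = SRSHL(acc, post_shift)     a negative shift rounds right
        #   out = clamp(narrow(acc) + output_zero_point, output_min, output_max)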
        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add output zero point

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    2f
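
        # Editorial note: the SUBS above left x1 = nc - 16.  Its low four bits
        # still equal the remaining column count, so the odd-width path at 2:
        # peels 8/4/2/1 columns with the TBZ tests on bits 3..0 of x1.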

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b
        RET

        # Store odd width
        .p2align 3
2:
        TBZ     x1, 3, 3f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
3:
        TBZ     x1, 2, 4f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
4:
        TBZ     x1, 1, 5f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
5:
        TBZ     x1, 0, 6f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
6:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif