// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data

// Overview (derived from the code below):
//   Computes one output row of up to 8 int8 values per pass over nc.
//   For every group of 8 output columns the weight stream at x5 is read as
//     8 x 4-byte initial accumulator values (bias),
//     then, per A pointer, 64 bytes for every 8 bytes of K
//       (8 columns x 8 bytes; column c of a group sits at byte offset 8 * c),
//     then 8 x 4-byte floating-point per-column scales.
//   Each of the eight accumulators v16, v18, ... v30 holds four 32-bit partial
//   sums for one column; they are reduced with ADDP, scaled in floating
//   point, converted back to integer, offset, narrowed to 8 bits and clamped.
//   The incoming values of mr (x0) and cm_stride (x7) are never read; x0 and
//   x7 are reused as scratch registers.
//   Only v0-v7, v16-v30 and x0-x17 are written, so nothing is saved on the
//   stack.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

        # Load stack arguments and round kc up to a multiple of 8
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        // Label 0: start of one group of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // One 32-bit bias per accumulator register (32 bytes of w in total).
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        // Label 1: one iteration per entry of the indirection buffer a.
        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero if a0 was zero, else a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc < 16: only the 8-byte remainder path runs

        # Prologue: load A0 and 4 B's
        // v0 = A bytes 0-7, v6 = A bytes 8-15.
        // v4/v5 = B for columns 0/1 against v0, v2/v3 = B for columns 0/1
        // against v6, x16 = B for column 2 against v0 (moved to v4 by INS later).
        LDP     d0, d6, [x13], 16       // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f                      // fewer than 32 bytes: go straight to the epilogue

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.

        // Each BLOCK handles two columns: SMULL multiplies B by A bytes 0-7 (v0),
        // SMLAL adds the product of B with A bytes 8-15 (v6), and the SADALP pair
        // in the following BLOCK folds the 16-bit sums into the 32-bit
        // accumulators.  Even B halves travel through x16/x17 and are moved into
        // v4/v2 with INS; odd halves are loaded directly into d5/d3.
        // NOTE(review): presumably this split exists to suit Cortex-A53 issue
        // rules - confirm against the template before reordering anything.
        // Each iteration also loads the A and B data that the next iteration
        // (or the epilogue) starts with.
        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        // Columns 0-1: B offsets 0/8 and 64/72.
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        // Columns 2-3: B offsets 16/24 and 80/88.  Accumulates columns 0-1.
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        // Columns 4-5: B offsets 32/40 and 96/104.  Accumulates columns 2-3.
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // first B value of the next 128-byte block
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        // Columns 6-7: B offsets 48/56 and 112/120.  Accumulates columns 4-7.
        // Offsets 128 and above belong to the next block; after x5 += 128 the
        // registers hold the same data the prologue would have loaded.
        // v0 and v6 are overwritten only after their last use in this block.
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x13], 8            // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        // Consumes the 16 bytes of A already held in v0/v6 together with one
        // 128-byte block of B; nothing beyond that block is read.
        .p2align 3
3:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        // x0 is now -16 (nothing left) or -8 (8 bytes left); bit 3 tells them apart.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Label 4: all A pointers consumed; reduce and requantize 8 columns.
4:
        # Add columns
        // Pairwise adds collapse the four lanes of every accumulator:
        // v0 = sums for columns 0-3, v1 = sums for columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Load per channel scale values from weights
        SCVTF   v0.4s, v0.4s            // int32 -> float
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest
        FCVTNS  v1.4s, v1.4s

        // params is read as a 16-bit value followed by two bytes:
        //   +0: added with saturation to the 16-bit result
        //       (presumably the output zero point - confirm against the params struct)
        //   +2: lower clamp bound, +3: upper clamp bound.
        // SUBS x1 sets the flags used by B.LO and B.HI further down; no
        // instruction in between writes the condition flags.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s            // saturating narrow to 16 bits
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h            // saturating narrow to 8 bits
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b     // clamp from below
        SUB     x11, x11, 3             // rewind params pointer

        SMIN    v0.8b, v0.8b, v17.8b    // clamp from above
        B.LO    6f                      // fewer than 8 columns were left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b                      // more columns remain
        RET

        # Remainder - 8 bytes of A
        // Reached when kc < 16, or when 8 bytes are left after the epilogue.
        // Uses 8 bytes of A and 64 bytes of B: columns 0-3 first, then 4-7.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        // x1 holds nc - 8 here; its low three bits equal those of the remaining
        // column count, so bits 2, 1 and 0 select a 4-, 2- and 1-byte store.
        // EXT rotates the bytes just stored out of the low lanes.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif