// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Overview (derived from the instructions below):
//   The kernel produces one row of up to 8 output bytes per outer iteration
//   (nc counts down in steps of 8).  For each iteration it:
//     1. loads eight 32-bit values from w as the initial accumulators;
//     2. walks ks / 8 pointers from a.  A pointer equal to `zero` is used
//        unchanged, any other pointer has a_offset added.  For each pointer,
//        kc bytes of A (kc rounded up to a multiple of 8) are multiplied
//        against groups of 8 x 8 bytes of w and accumulated in 32 bits;
//     3. reduces the partial sums, converts them to float, multiplies by
//        eight floats read from w, converts back to integers (FCVTNS),
//        adds a 16-bit value from params, saturates to 8 bits, clamps
//        between two bytes from params and stores to c.
//   mr (x0) and cm_stride (x7) are never read: x0 is reused as the k
//   counter and x7 as a scratch register.
//   NOTE(review): the terms bias, per-channel scale and the meaning of the
//   params fields follow the existing comments and the function name; the
//   params layout is inferred from the loads only - confirm against the
//   params union definition.

# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Load the stack arguments and round kc up to a multiple of 8
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators.
        # Each LDP reads two 32-bit words; a scalar S-register load clears the
        # upper lanes, so every accumulator starts as {bias, 0, 0, 0}.
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset (ADD leaves the CMP flags intact)
        CSEL    x13, x12, x13, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc < 16: only the 8-byte remainder runs

        # Prologue: load 16 bytes of A0 and the first B values
        LDP     d0, d6, [x13], 16       // Read A0: v0 = k 0..7, v6 = k 8..15
        LDP     d4, d5, [x5]            // Read B: columns 0 and 1, k 0..7
        LDP     d2, d3, [x5, 64]        // Read B: columns 0 and 1, k 8..15
        LDR     x16, [x5, 16]           // Read B: column 2, k 0..7 (held in a GPR)

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adap + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        #
        # w is consumed 128 bytes per iteration: offset 8 * j holds column j
        # for k 0..7 and offset 64 + 8 * j holds column j for k 8..15.
        # Each block multiplies two columns while the operands for the next
        # two columns are loaded; half of them go through x16 / x17 and are
        # moved into the vector register with INS.
        # NOTE(review): the GPR load + INS pattern is presumably chosen for
        # Cortex-A53 issue rules - confirm against the template.

        .p2align 3
2:
        # BLOCK 0 - 6 cycles (columns 0 and 1)
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]           // column 2, k 8..15
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]            // column 3, k 0..7
        INS     v4.d[0], x16            // column 2, k 0..7
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]           // column 4, k 0..7
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]            // column 3, k 8..15
        INS     v2.d[0], x17            // column 2, k 8..15

        # BLOCK 1 - 10 cycles (columns 2 and 3; accumulate columns 0 and 1)
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]           // column 4, k 8..15
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 448]    // prefetch w ahead of the current position
        SADALP  v18.4s, v19.8h
        PRFM    PLDL1KEEP, [x5, 512]
        LDR     d5, [x5, 40]            // column 5, k 0..7
        INS     v4.d[0], x16            // column 4, k 0..7
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]           // column 6, k 0..7
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]           // column 5, k 8..15
        INS     v2.d[0], x17            // column 4, k 8..15

        # BLOCK 2 - 10 cycles (columns 4 and 5; accumulate columns 2 and 3)
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]          // column 6, k 8..15
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        PRFM    PLDL1KEEP, [x13, 128]   // prefetch A ahead of the current position
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]            // column 7, k 0..7
        INS     v4.d[0], x16            // column 6, k 0..7
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // next iteration: column 0, k 0..7
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]           // column 7, k 8..15
        INS     v2.d[0], x17            // column 6, k 8..15

        # BLOCK 3 - 15 cycles (columns 6 and 7; accumulate columns 4..7)
        # Also reloads A0 and the B operands that BLOCK 0 of the next
        # iteration expects in v4, v5, v2, v3 and x16.
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x13], 8            // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // flags are consumed by B.HS at the end of the block
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128             // ADD (not ADDS) keeps the SUBS flags
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8 and x0 only ever had multiples of 16
        # subtracted, so bit 3 of x0 equals bit 3 of kc.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Pairwise adds fold the four partial sums of each accumulator:
        # v0 ends up with the totals of v16, v18, v20, v22 and v1 with the
        # totals of v24, v26, v28, v30, in that lane order.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Load per channel scale values from weights
        # (eight floats that follow the int8 data in w)
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        # params is read as: a 16-bit value at offset 0 (added after the
        # narrow to 16 bits), a byte at offset 2 (lower clamp, SMAX) and a
        # byte at offset 3 (upper clamp, SMIN).
        # NOTE(review): presumably output zero point, output min and output
        # max - confirm against the params union.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags are used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 3             // rewind params pointer

        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f                      // fewer than 8 outputs were left

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b                      // more outputs remain (nc > 0)
        RET

        # Remainder - 8 bytes of A
        # One 64-byte group of w: offset 8 * j is multiplied with the same
        # 8 bytes of A and accumulated into column j.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # x1 holds nc - 8 here; subtracting 8 leaves bits 0..2 unchanged, so
        # they select stores of 4, 2 and 1 bytes.  EXT rotates v0 so the next
        # unstored byte moves to lane 0.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif