// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v6
# B    x5  v4  v5  v2  v3
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23

// Overview (derived from the instruction stream below):
//   * One output row of 8 columns is produced per pass of the nc loop
//     (label 0); the last pass may store fewer than 8 bytes (labels 6-9).
//   * Label 1 walks the indirection buffer `a`, one pointer per 8 bytes of ks.
//     A pointer equal to `zero` is used as is; any other pointer has a_offset
//     added to it.
//   * kc is rounded up to a multiple of 8.  A is consumed 16 bytes at a time
//     (labels 2 and 3) with one optional trailing 8-byte step (label 5).
//     Each 16-byte step consumes 128 bytes of w, each 8-byte step 64 bytes.
//   * Products are formed as int16 (SMULL/SMLAL) and pairwise accumulated
//     into int32 (SADALP).  Label 4 reduces them to 8 sums, converts to
//     float, multiplies by 8 floats read from w, converts back with FCVTNS,
//     narrows with saturation and applies the values read from params.
//   * params is read as: int16 at +0 (added with saturation after narrowing to
//     16 bits), int8 at +2 (lower bound, SMAX), int8 at +3 (upper bound, SMIN).
//     NOTE(review): presumably output zero point / min / max - confirm against
//     the params struct definition, which is not visible in this file.
//   * mr (x0) is not read; x0 is reused as the k counter.  cm_stride (x7) is
//     not read.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

        # Load stack arguments and round kc up to a multiple of 8
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # (scalar S-register loads also clear the upper lanes of each vector)
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // fewer than 16 bytes: 8-byte remainder only

        # Prologue: load A0 and 4 B vectors
        LDP     d0, d6, [x13], 16       // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # Template cycle estimate (not re-derived for this 1-row variant):
        # four groups of 2 mul/mla/adalp = 6 cycles,
        # two loads for A0, A1 = +4 cycles.  Total 36 cycles.
        # NOTE(review): only A0 is loaded in this kernel; the A1 mention looks
        # inherited from a multi-row template - confirm.

        .p2align 3
2:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 14 cycles
        # Also advances w and preloads A and B for the next 16-byte step.
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d0, d6, [x13], 16       // Read A0
        SADALP  v28.4s, v21.8h
        LDP     d2, d3, [x5, 64]        // Read B
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 8 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16; leaves bit 3 of k (tested below) unchanged
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        TBNZ    x0, 3, 5f               // bit 3 of k set: 8 more bytes of A to process

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Each accumulator holds 4 partial sums of one column; three levels of
        # pairwise adds leave columns 0-3 in v0 and columns 4-7 in v1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Load per channel scale values from weights
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2       // int16 at params + 0
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1      // int8 at params + 2 (lower bound)
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]        // int8 at params + 3 (upper bound)
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 3             // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f                      // fewer than 8 columns left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b
        RET

        # Remainder - 8 bytes of A
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of nc select a 4-, 2- and 1-byte store in turn; EXT
        # rotates the bytes already stored out of the low lanes.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif