// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are written by this kernel, so nothing is spilled.

# Register usage
# A0     x13  v0  v6
# B       x5  v4  v5  v2  v3          (remainder path: v4 v5 v6 v7)
# C0      x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0      v17 v19 v21 v23
# requantization temporaries: v0 v1 v4 v5 v17

# Overview of what the code below does:
#   Label 0: start of one 8-column output tile. The eight accumulators are
#            seeded from eight 32-bit values read from w.
#   Label 1: one iteration per A pointer fetched from the indirection buffer
#            (x9 counts down from ks in steps of 8 bytes, one pointer each).
#   Label 2: main loop, 16 bytes of A against 128 bytes of B per iteration,
#            with the loads for the next iteration issued inside the loop.
#   Label 3: epilogue, the same arithmetic for the final 16 bytes without the
#            trailing loads.
#   Label 5: remainder, 8 bytes of A against 64 bytes of B.
#   Label 4: reduce accumulators, scale in fp32, convert back, add the 16-bit
#            parameter, narrow with saturation, clamp, and store.
#   Labels 6-9: partial store of 4, 2 and 1 bytes when fewer than 8 columns
#            remain.
#
# NOTE(review): each SMULL/SMLAL pair sums two int8 x int8 products in a
# 16-bit lane before SADALP widens to 32 bits. Two products of -128 * -128
# would not fit in int16, so this presumably relies on the packed weights
# never being -128. Confirm against the weight packing code.


BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm

        # Load the stack arguments and round kc up to a multiple of 8.
        # (There is a single output row, so no C pointer clamping is done;
        # x0 (mr) and x7 (cm_stride) are never read.)
        LDP     x10, x8, [sp]                   // Load cn_stride, a_offset
        ADD     x2, x2, 7                       // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]              // Load zero, params pointer
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of its accumulator and zeroes the
        # remaining lanes.
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                          // p = ks

        .p2align 3
1:
        # Load next A pointer
        # The offset is added speculatively and then discarded when the
        # pointer equals the zero buffer, so the zero buffer is used as is.
        LDR     x13, [x4], 8
        CMP     x13, x12                        // if a0 == zero
        ADD     x13, x13, x8                    // a0 += a_offset
        CSEL    x13, x12, x13, EQ               // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        # Otherwise kc is exactly 8 and only the remainder path runs.
        SUBS    x0, x2, 16                      // k = kc - 16
        B.LO    5f

        # Prologue: load A0 and 4 B vectors
        LDP     d0, d6, [x13], 16               // Read A0: bytes 0-7 in v0, bytes 8-15 in v6
        LDP     d4, d5, [x5]                    // Read B: columns 0-1, paired with v0
        LDP     d2, d3, [x5, 64]                // Read B: columns 0-1, paired with v6

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16                      // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # Cycle estimates are carried over from the template and have not
        # been verified here:
        #   Four groups of 2 SMULL/SMLAL/SADALP = 6 cycles.
        #   Loads for A = +4 cycle.  Total 36 cycles.
        # Each block handles two output columns: SMULL multiplies B by the
        # first 8 bytes of A, SMLAL adds the product with the second 8 bytes,
        # and SADALP pairwise-adds the 16-bit sums into a 32-bit accumulator.

        .p2align 3
2:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 14 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128                     // w += 128 (16 k-bytes x 8 columns)
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16                      // k -= 16; flags consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDP     d4, d5, [x5]                    // Read B
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d0, d6, [x13], 16               // Read A0
        SADALP  v28.4s, v21.8h
        LDP     d2, d3, [x5, 64]                // Read B
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 8 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128                     // w += 128 (16 k-bytes x 8 columns)
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        // NOTE(review): this SUBS looks redundant in the epilogue. Its flags
        // are overwritten by the next SUBS x9 before any conditional branch
        // reads them, and subtracting 16 leaves bit 3 (tested by TBNZ below)
        // unchanged. Confirm against the template before touching it.
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        # kc was rounded to a multiple of 8, so bit 3 of k says whether one
        # 8-byte group is still pending.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8                       // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Pairwise adds fold the 4 partial sums of every accumulator; the
        # result is columns 0-3 in v0 and columns 4-7 in v1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Load per channel scale values from weights
        # The int32 sums are converted to fp32 and multiplied by eight
        # floats that follow the B data in w.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        FCVTNS  v0.4s, v0.4s                    // fp32 -> int32, round to nearest even
        FCVTNS  v1.4s, v1.4s

        # Params are read as: one 16-bit value added with saturation
        # (presumably the output zero point - confirm against the params
        # struct), then one byte used as the lower clamp bound and one byte
        # used as the upper clamp bound.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8                       // nc -= 8; flags used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 3                     // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f                              // fewer than 8 columns left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10              // c += cn_stride
        SUB     x4, x4, x3                      // a -= ks
        B.HI    0b                              // more columns remain (nc > 0)
        RET

        # Remainder - 8 bytes of A
        # One 8-byte group of A against 64 bytes of B (8 columns x 8 bytes),
        # using plain SMULL + SADALP since there is no second half to add.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64                      // w += 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8                       // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of the number
        # of columns left, so bits 2, 1 and 0 select 4-, 2- and 1-byte
        # stores. EXT rotates the stored bytes out of the low lanes.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif