// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm(
#     size_t mr,                 x0          (not read, x0 is reused as the k counter)
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)        (not read by this kernel)
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
#
# Summary of what the code below does:
#   For each group of 8 output columns, the 8 int32 accumulators are seeded
#   from the first 32 bytes at w.  For each of the ks / 8 pointers read from a,
#   kc bytes of A (kc rounded up to a multiple of 8) are multiplied against
#   8 columns of int8 data from w, 8 bytes of A per column at a time, and the
#   widened products are pairwise-added into the accumulators.  The sums are
#   then converted to float, scaled, rounded back to integer, offset,
#   saturated to int8, clamped and stored to c.
#
# Layout read from params (x11), in order:
#   4 bytes  multiplier applied with FMUL (float scale)
#   2 bytes  16-bit value added after narrowing (presumably the output zero
#            point - confirm against the params struct)
#   1 byte   lower clamp bound
#   1 byte   upper clamp bound

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are used here, so nothing is saved or restored.

# Register usage
# A0      x13  v0  v6
# B        x5  v4  v5  v2  v3   (remainder path also uses v6 v7)
# C0       x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23
# params  x11  v4  v5  v1 v17   (reused after the accumulation is finished)


BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm

        # Load the stack arguments and round kc up to a multiple of 8
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        # Outer loop: one group of 8 output columns per iteration
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # (one 32-bit value in lane 0 of each accumulator, upper lanes zeroed)
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        # ks loop: one A pointer per iteration
        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 stays a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc == 8: only the 8 byte remainder runs

        # Prologue: load A0 and 4 B's
        LDP     d0, d6, [x13], 16       // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 2 smull/smlal/sadalp = 6 cycles.
        # 2 loads for A0 = +4 cycle.  Total 36 cycles.
        # NOTE(review): cycle counts are inherited estimates, not verified here.
        #
        # v0 holds A bytes 0-7 and v6 holds A bytes 8-15.  For each output
        # column, SMULL forms the products for bytes 0-7, SMLAL adds the
        # products for bytes 8-15 (B data 64 bytes further on), and SADALP
        # pairwise-adds the 8 halfword sums into the 4 word lanes of the
        # accumulator for that column.

        .p2align 3
2:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 14 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16, flags consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d0, d6, [x13], 16       // Read A0
        SADALP  v28.4s, v21.8h
        LDP     d2, d3, [x5, 64]        // Read B
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 8 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16: leaves bit 3 of k unchanged; flags are not read before the next SUBS
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8, so bit 3 of k is set exactly when 8 bytes are left
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Three levels of pairwise adds collapse the 4 lanes of every
        # accumulator: v0 = totals for columns 0-3, v1 = totals for columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Apply params - scale, bias and clamp
        SCVTF   v0.4s, v0.4s            // int32 -> float
        LD1R    {v4.4s}, [x11], 4       // broadcast the float scale
        SCVTF   v1.4s, v1.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s

        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest, ties to even
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit offset
        SQXTN   v0.4h, v0.4s            // saturating narrow to int16
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8, flags used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the offset
        LD1R    {v1.16b}, [x11], 1      // broadcast the lower clamp bound
        SQXTN   v0.8b, v0.8h            // saturating narrow to int8
        LD1R    {v17.16b}, [x11]        // broadcast the upper clamp bound
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1 bytes)
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f                      // fewer than 8 columns left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b                      // more columns remain
        RET

        # Remainder - 8 bytes of A
        # Single SMULL + SADALP per column, no second half of A to SMLAL in
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64              // w += 8 columns * 8 bytes
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of nc select a 4, 2 and 1 byte store.  EXT rotates
        # the bytes already written out of the low lanes of v0.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif