// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union ${PARAMS_UNION} params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0  v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data
#
# Notes on the register roles above:
#   x0 (mr) and x7 (cm_stride) are not read as arguments; both are reused as
#   scratch (x0 = remaining k counter, x7 = staging register for A0 data).
#   Only v0-v7, v16-v31 and x0-x17 are written, so nothing needs to be saved.
#   Half of every B/A load goes through a general purpose register followed by
#   INS, the other half is a direct LDR into a D register.
#   NOTE(review): presumably this split is what lets loads pair with the NEON
#   multiplies on Cortex-A53 - confirm against the A53 optimization guide.


BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53

        # Load the stack arguments and round kc up to a multiple of 8.
        # There is a single output row pointer (x6), so no C pointers are clamped.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        # Outer loop: one pass per group of 8 output channels (nc is reduced by 8
        # per pass, see SUBS x1 below).
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each LDP writes one 32-bit bias into lane 0 of two accumulators and
        # advances w by 8 bytes; the scalar load form clears the upper lanes.
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        # ks loop: one pass per entry of the indirection buffer a.
        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        # Otherwise kc is 8 (it is a multiple of 8) and only the remainder runs.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue: load A0 and 4 B's
        # v0/v6 = A bytes 0-7 / 8-15, v4/v5 = B at offsets 0/8,
        # v2/v3 = B at offsets 64/72, x16 = B at offset 16 (inserted into v4 later).
        LDP     d0, d6, [x13], 16       // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        #
        # Every block handles two output channels: SMULL/SMLAL form 16-bit
        # products of B with the two 8-byte halves of A (v0, v6) in v17-v23, and
        # SADALP pairwise-accumulates them into the 32-bit accumulators.
        # Each pass consumes 16 bytes of A and 128 bytes of B, and already loads
        # the first operands of the following pass (offsets 128 and above).

        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v18.4s, v19.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 512]
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // first B of the next pass
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        # Finishes channels 6-7 and reloads A0 and the leading B operands for
        # the next pass (same roles as the prologue, shifted by 128 bytes).
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x13], 8            // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16, flags consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128             // w += 128
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128             // w += 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder? - 8 bytes of A
        # kc is a multiple of 8, so bit 3 of k is set exactly when 8 bytes remain.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Pairwise adds collapse the four lanes of each accumulator, leaving the
        # eight channel sums in v0 (channels 0-3) and v1 (channels 4-7).
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        $if REQUANTIZATION == "RNDNU":
          LD1R    {v4.4s}, [x11], 4
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        $if REQUANTIZATION == "RNDNU":
          LD1R    {v7.4s}, [x11], 4
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
          SQSHL   v1.4s, v1.4s, v4.4s
          SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
          SQDMULH v1.4s, v1.4s, v7.4s
          SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
          SRSHL   v1.4s, v1.4s, v5.4s
        $elif REQUANTIZATION == "FP32":
          $if not CHANNELWISE:
            # Apply params - scale, bias and clamp
            # One scale from params is broadcast to all channels.
            SCVTF   v0.4s, v0.4s
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v1.4s, v1.4s
            FMUL    v0.4s, v0.4s, v4.4s
            FMUL    v1.4s, v1.4s, v4.4s
          $else:
            # Load per channel scale values from weights
            # The 8 scales are read from w (x5) directly after the last B block.
            SCVTF   v0.4s, v0.4s
            LDR     q4, [x5], 16
            SCVTF   v1.4s, v1.4s
            LDR     q5, [x5], 16
            FMUL    v0.4s, v0.4s, v4.4s
            FMUL    v1.4s, v1.4s, v5.4s

          # Convert back to int32, rounding to nearest with ties to even.
          FCVTNS  v0.4s, v0.4s
          FCVTNS  v1.4s, v1.4s

        # Narrow to 16 bits with saturation, add the 16-bit value from params
        # (NOTE(review): presumably the output zero point - confirm against the
        # params layout), narrow to 8 bits, then clamp to [v1, v17].
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8, flags consumed by B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]        // upper clamp bound
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer

        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f                      // fewer than 8 channels left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b                      // more channels remain
        RET

        # Remainder - 8 bytes of A
        # Plain (non-pipelined) pass over 8 bytes of A and 64 bytes of B,
        # processing four channels at a time.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64              // w += 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of nc select a 4, 2 and 1 byte store; EXT rotates the
        # already stored bytes out so the next ones sit at the bottom of v0.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif