// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// 1x8 indirect (igemm) QC8 microkernel with fp32 requantization.
// Computes one row (MR=1) by eight columns (NR=8) of output per outer
// iteration, accumulating int8 A x int8 B products into int32, then
// converting to float, scaling by per-channel multipliers, rounding back
// to int32 (VCVTN, NEONv8 round-to-nearest-even) and clamping to int8.
//
// void xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35
//     size_t mr,                                 (r0)
//     size_t nc,                                  r1
//     size_t kc,                                 (r2) -> sp + 56 -> r5
//     size_t ks,                                 (r3) -> sp + 60 -> r14
//     const int8_t**restrict a,          sp + 88 -> r2
//     const void*restrict w,             sp + 92 -> r9
//     int8_t*restrict c,                 sp + 96 -> r11
//     size_t cm_stride,                  sp + 100 -> r6
//     size_t cn_stride,                  sp + 104 -> r12
//     size_t a_offset,                   sp + 108 -> (r5)
//     const int8_t* zero,                sp + 112 -> r7
//     xnn_qs8_minmax_params*params);     sp + 116 -> (r5)

// AAPCS32: d8-d15, r4-r11, r14(lr) need to be preserved if used.
// r13(sp), r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d9 q4  q5

// C0  r11 d16-d17  q8  d18-d19  q9
//     q2, q3 acc2 (second accumulator set, merged into q8/q9 before quantization)

// q0/q1 and q12 (d24/d25) are reused as scratch in the quantization epilogue
// (caller-saved, so no save/restore needed).
// Unused: r4, r8, r10, d14, d15, q10, q11, q13-q15

// params structure is 4 bytes
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35
        # Push 88 bytes
        # r2 (kc) and r3 (ks) are pushed so they can be reloaded from the
        # stack each iteration (sp+56 / sp+60); r2, r3 will be reloaded in
        # the outer loop.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}   // +32
        SUB     sp, sp, 8                           // +8 pad (keeps the VPUSH area layout)
        VPUSH   {d8-d13}                            // +48 = 88 total; d8-d13 are callee-saved

        # Stack arguments now live at sp + 88 and above.
        LDR     r2, [sp, 88]            // a (array of MR pointers to A rows)
        LDR     r9, [sp, 92]            // w (bias, then packed int8 weights, then fp32 multipliers)
        LDR     r11, [sp, 96]           // c
        LDR     r6, [sp, 100]           // cm_stride (unused for MR=1 beyond ABI shape)
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7, [sp, 112]           // zero (points at a zero buffer for padded rows)
        LDR     r5, [sp, 116]           // params
        MOV     r14, r3                 // p = ks (bytes of indirection pointers remaining)

        # Load params values
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params: zero_point/min/max packed in d13


        .p2align 3
0:
        # Outer loop: one pass per NR=8 output channels.
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8, q9 (int32 x8)
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        VMOV.I32 q3, 0

        .p2align 3
1:
        # ks loop: walk the indirection buffer for this output row.
        # Load next A pointer
        LDR     r3, [r2, 0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3, r7                  // if a0 == zero (padded row sentinel)
        ADD     r3, r3, r5              // a0 += a_offset (ADD does not touch flags)
        MOVEQ   r3, r7                  // a0 = zero buffer if sentinel, else keeps a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels? -> remainder-only path

        // Prologue - load A0 and B0 ahead of the pipelined main loop
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     3f                      // less than 16 channels? -> run epilogue once

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes for weights (8 rows x 8 columns of int8 B).
        // Loads for the NEXT block are interleaved with MLALs of the
        // CURRENT block; do not reorder.

        .p2align 3
2:
        // Extend int8 -> int16 lanes for MLAL
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0 for the next iteration
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0 for the next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue - same as the main loop body, but without prefetching
        // A0/B0 for a following iteration.

        .p2align 3
3:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore remainder count: r5 = kc & 7

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? - 1 to 7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks (reload for the next outer iteration)

        // Fold the second accumulator set into the first.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
        VLD1.8  {q0-q1}, [r9]!          // per-channel fp32 multipliers follow the weights

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0             // multiplier
        VMUL.F32 q9, q9, q1

        VCVTN.S32.F32 q8, q8            // NEONv8: convert with round-to-nearest-even
        VCVTN.S32.F32 q9, q9

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8              // saturating narrow int32 -> int16
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0           // add zero point with saturation

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8              // saturating narrow int16 -> int8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8; sets flags for the store path below

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f                      // nc < 8 -> partial store
        VST1.8  {d0}, [r11], r12
        SUB     r2, r2, r14             // a -= ks (rewind indirection buffer)
        BHI     0b                      // more than 8 columns remained -> next NR tile

        // Return: unwind the 88-byte frame pushed in the prologue.
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder - 1 to 7 bytes of A
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        // NOTE(review): loads a full 8 bytes of A even for remainders < 8;
        // the packing/indirection setup presumably guarantees this overread
        // is safe, as is conventional for these microkernels.
        VLD1.8  {d0},  [r3]
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width (nc & 7 remaining columns), widest piece first,
        # shifting stored bytes out of q0 with VEXT between pieces.
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif