// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                                 (r0)
//     size_t nc,                                  r1
//     size_t kc,                                 (r2) -> sp + 56 -> r5
//     size_t ks,                                 (r3) -> sp + 60 -> r14
//     const int8_t**restrict a,       sp + 88 -> r2
//     const void*restrict w,          sp + 92 -> r9
//     int8_t*restrict c,              sp + 96 -> r11
//     size_t cm_stride,               sp + 100 -> r6
//     size_t cn_stride,               sp + 104 -> r12
//     size_t a_offset,                sp + 108 -> (r5)
//     const int8_t* zero,             sp + 112 -> r7
//     xnn_qs8_minmax_params* params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on the cortex_a53 microkernel, but with NEON loads.

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d9 q4, d10-d11 q5

// C0  r11  d16-d17 q8, d18-d19 q9
//          q2, q3 acc2

// Unused  r4, r8, r10, d14-d15 (q7), q10-q11, q13-q15

// params structure is 10 bytes
//  struct {
//    float   magic_bias;                         d12[0]
//    int32_t magic_bias_less_output_zero_point;  d12[1]
//    int8_t  output_min;                         d13[6]
//    int8_t  output_max;                         d13[7]
//  } xnn_qs8_minmax_params.neon;

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}   // +32
        SUB     sp, sp, 8                           // +8
        VPUSH   {d8-d13}                            // +48 = 88

        LDR     r2, [sp, 88]            // a
        LDR     r9, [sp, 92]            // w
        LDR     r11, [sp, 96]           // c
        LDR     r6, [sp, 100]           // cm_stride
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7, [sp, 112]           // zero
        LDR     r5, [sp, 116]           // params
        MOV     r14, r3                 // p = ks

        # Load params values
        VLDM    r5!, {d12}              // QC8 neon params
        VLD1.16 {d13[]}, [r5]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        VMOV.I32 q3, 0

        .p2align 3
1:
        # Load next A pointer
        LDR     r3, [r2, 0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3, r7                  // if a0 == zero
        ADD     r3, r3, r5              // a0 += a_offset
        MOVEQ   r3, r7                  // a0 = zero if a0 == zero, else a0 += a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!             // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8}, [r9]!             // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes of A.
        // 64 bytes for weights.
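        // Each BLOCK below issues one 8-byte weight load, multiply-accumulates
        // the weights widened by the previous BLOCK against a single lane of A,
        // and widens the freshly loaded weights for the next BLOCK. Alternating
        // between two B registers (q4/q5) and two accumulator sets (q8/q9 and
        // q2/q3, summed after the ks loop) hides load and VMOVL latency on
        // in-order cores such as the Cortex-A7.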
        .p2align 3
2:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!             // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8

        // BLOCK 7
        VLD1.8  {d8}, [r9]!             // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue

        .p2align 3
3:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? - 1 to 7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks

        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
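        # The magic-bias trick: adding the large float magic_bias pushes the
        # rounded integer value of the scaled accumulator into the low mantissa
        # bits of the float. Reinterpreting those bits as int32 and subtracting
        # magic_bias_less_output_zero_point with saturation (VQSUB) recovers
        # that integer with the output zero point already added.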
        VLD1.8  {q0-q1}, [r9]!

        VDUP.32 q2, d12[0]              // magic_bias
        VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0             // multiplier
        VMUL.F32 q9, q9, q1

        VADD.F32 q8, q8, q2             // magic_bias
        VADD.F32 q9, q9, q2

        VQSUB.S32 q8, q8, q3            // magic_bias_less_output_zero_point
        VQSUB.S32 q9, q9, q3

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0, q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f
        VST1.8  {d0}, [r11], r12
        SUB     r2, r2, r14             // a -= ks
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder - 1 to 7 bytes of A
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        VLD1.8  {d0}, [r3]
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
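// For reference, a rough C sketch of what this microkernel computes
// (illustrative only; the actual packed layout of w interleaves bias,
// weights, and per-channel multipliers, and the helper names here are
// hypothetical):
//
//   for (size_t n = 0; n < 8; n++) {
//     int32_t acc = bias[n];
//     for (size_t p = 0; p < ks; p++) {      // indirection loop
//       const int8_t* a0 = a[p] == zero ? zero : a[p] + a_offset;
//       for (size_t k = 0; k < kc; k++) {
//         acc += (int32_t) a0[k] * (int32_t) weight(k, n);  // hypothetical accessor
//       }
//     }
//     const float scaled = (float) acc * multiplier[n];     // per-channel scale
//     const float biased = scaled + params.magic_bias;
//     const int32_t out = sat_sub_s32(bits_as_s32(biased),
//                                     params.magic_bias_less_output_zero_point);
//     c[n] = clamp(sat_narrow_s8(out), params.output_min, params.output_max);
//   }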