// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                                  r0
//     size_t nc,                                  r1
//     size_t kc,                                  (r2) -> r5
//     const int8_t*restrict a,                    r3
//     size_t a_stride,                 sp + 96 -> (unused)
//     const void*restrict w,           sp + 100 -> r9
//     int8_t*restrict c,               sp + 104 -> r11
//     size_t cm_stride,                sp + 108 -> (unused)
//     size_t cn_stride,                sp + 112 -> r7
//     xnn_qs8_minmax_params params)    sp + 116 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3   d0-d1   q0

// B   r9   d8-d11  q4 q5

// C0  r11  d16-d17 q8  d18-d19 q9
//          q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d14-d15, q10-q11, q13-q15

// params structure is 10 bytes
// struct {
//   float    magic_bias;                          d12[0]
//   int32_t  magic_bias_less_output_zero_point;   d12[1]
//   int8_t   output_min;                          d13[6]
//   int8_t   output_max;                          d13[7]
// } xnn_qs8_minmax_params.neon;

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}               // 16
        SUB     sp, sp, 32                      // +32
        VPUSH   {d8-d13}                        // +48 = 96

        LDR     r11, [sp, 104]                  // c
        LDR     r9, [sp, 100]                   // w
        LDR     r5, [sp, 116]                   // params

        # Load params values
        VLDM    r5!, {d12}                      // QC8 neon params
        VLD1.16 {d13[]}, [r5]                   // output_min/max
        LDR     r7, [sp, 112]                   // cn_stride


        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}                  // Bias
        VMOV.I32 q2, 0                          // second set of C for pipelining FMLA
        SUBS    r5, r2, 8                       // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                              // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!                     // A0
        SUBS    r5, r5, 8                       // k = k - 8
        VLD1.8  {d8}, [r9]!                     // B0
        BLO     2f                              // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10}, [r9]!                    // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!                     // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!                    // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!                     // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!                     // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!                    // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!                     // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!                    // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8}, [r9]!                     // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!                    // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!                     // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!                    // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!                     // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!                    // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!                     // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!                    // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? - 1 to 7 bytes of A
        BNE     4f

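        // Tail: fold the second accumulator set (q2, q3) into q8/q9, requantize,
        // clamp and store. The QC8 FP32 requantization below converts the int32
        // accumulators to float and scales them by the per-channel multipliers
        // that follow the packed weights in w. Adding the float magic_bias fixes
        // the exponent so the rounded result lands in the low mantissa bits; the
        // saturating integer subtract of magic_bias_less_output_zero_point then
        // recovers that integer and adds the output zero point in one step,
        // before narrowing to int8 and clamping to [output_min, output_max].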
3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
        VLD1.8  {q0-q1}, [r9]!

        VDUP.32 q2, d12[0]                      // magic_bias
        VDUP.32 q3, d12[1]                      // magic_bias_less_output_zero_point

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0                     // multiplier
        VMUL.F32 q9, q9, q1

        VADD.F32 q8, q8, q2                     // magic_bias
        VADD.F32 q9, q9, q2

        VQSUB.S32 q8, q8, q3                    // magic_bias_less_output_zero_point
        VQSUB.S32 q9, q9, q3


        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9


        VDUP.8  d24, d13[6]                     // output_min

        VQMOVN.S16 d0, q8

        VDUP.8  d25, d13[7]                     // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16                      // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7                       // kc remainder 1 to 7

        VLD1.8  {d0}, [r3], r5
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16                      // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif