// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// 1x8 QS8 GEMM microkernel (signed 8-bit integer GEMM with RNDNU requantization),
// tuned for Cortex-A7: MLAL-lane multiply-accumulate with NEON loads, loads and
// widening interleaved with the multiplies to hide latency on the in-order pipeline.
//
// void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const int8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     int8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qs8_conv_minmax_params params)  sp + 116 -> (r5)
// (mr is 1 for this kernel, so a_stride/cm_stride are never read.)

// AAPCS32: d8-d15, r4-r11, r14(lr) must be preserved if used; r13(sp), r15(pc) reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3  d0-d1 q0            current 8 bytes of activations, widened to int16
// B   r9  d8-d9 q4  q5        weights, widened to int16 (q4/q5 ping-pong)
// C0  r11 d16-d17 q8  d18-d19 q9   primary accumulators (8 x int32)
// q2, q3 acc2                 second accumulator set for pipelining the MLALs
// d24, d25                    output_min / output_max clamp values
// Unused: r4, r6, r8, r10, r12, d15, q1, q10, q11, q13-q15

// params structure is 16 bytes
//  struct {
//    int32_t right_pre_shift;   d12[0]
//    int32_t multiplier;        d12[1]
//    int32_t right_post_shift;  d13[0]
//    int16_t output_zero_point; d13[2]
//    int8_t output_min;         d13[6]
//    int8_t output_max;         d13[7]
//  } rndnu_neon;

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 96 bytes total so the stack arguments land at fixed offsets:
        PUSH {r5, r7, r9, r11}              // 16
        SUB sp, sp, 32                      // +32 (alignment pad; epilogue skips it)
        VPUSH {d8-d13}                      // +48 = 96

        LDR r11, [sp, 104]          // c
        LDR r9, [sp, 100]           // w
        LDR r5, [sp, 116]           // params

        # Load params values
        VLDM r5, {d12-d13}          // RNDNU params (kept resident in d12-d13)
        LDR r7, [sp, 112]           // cn_stride


        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM r9!, {d16-d19}         // Bias -> q8,q9
        VMOV.I32 q2, 0              // second set of C for pipelining FMLA
        SUBS r5, r2, 8              // k = kc - 8
        VMOV.I32 q3, 0
        BLO 4f                      // fewer than 8 bytes of K? handle remainder only

        // Prologue - load A0 and B0 so the main loop can overlap load with MLAL
        VLD1.8 {d0}, [r3]!          // A0
        SUBS r5, r5, 8              // k = k - 8
        VLD1.8 {d8}, [r9]!          // B0
        BLO 2f                      // exactly one 8-byte group left? go to epilogue

        // Main loop - 8 bytes of K per iteration.
        // 64 bytes of weights consumed per iteration (8 rows x 8 columns).
        // Each BLOCK: load next B row, MLAL the previous one against one A lane,
        // alternating accumulator sets (q8/q9 vs q2/q3) and B registers (q4 vs q5).

        .p2align 3
1:
        // Extend int8 -> int16 for the lane multiplies
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8 {d10}, [r9]!         // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8 {d8}, [r9]!          // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8 {d10}, [r9]!         // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8 {d8}, [r9]!          // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8 {d0}, [r3]!          // A0 for the next iteration (d0 lanes done)
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8 {d10}, [r9]!         // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8 {d8}, [r9]!          // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8 {d10}, [r9]!         // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8 {d8}, [r9]!          // B0 for the next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS r5, r5, 8
        BHS 1b

        // Epilogue - same schedule as the main loop but without the next-iteration
        // A0/B0 prefetch, consuming the pair loaded by the prologue/last iteration.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8 {d10}, [r9]!         // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8 {d8}, [r9]!          // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8 {d10}, [r9]!         // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8 {d8}, [r9]!          // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8 {d10}, [r9]!         // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8 {d8}, [r9]!          // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8 {d10}, [r9]!         // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS r5, r5, 8              // restore k; Z set iff kc was a multiple of 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE 4f

3:
        // Merge the two accumulator sets
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization: saturating pre-shift, doubling multiply-high,
        # rounding post-shift, add zero point, narrow, then clamp.
        VDUP.32 q0, d12[0]          // right_pre_shift

        VQSHL.S32 q8, q8, q0
        VQSHL.S32 q9, q9, q0

        VDUP.32 q2, d13[0]          // right_post_shift

        VQDMULH.S32 q8, q8, d12[1]  // multiplier
        VQDMULH.S32 q9, q9, d12[1]

        VRSHL.S32 q8, q8, q2

        VRSHL.S32 q9, q9, q2

        VDUP.16 q0, d13[2]          // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8 d24, d13[6]          // output_min

        VQMOVN.S16 d0, q8

        VDUP.8 d25, d13[7]          // output_max

        VMAX.S8 d0, d0, d24

        SUBS r1, r1, 8              // nc -= 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO 5f                      // fewer than 8 columns left? store partial row
        VST1.8 {d0}, [r11], r7
        SUB r3, r3, r2              // rewind a by kc for the next column group
        BHI 0b

        VPOP {d8-d13}
        ADD sp, sp, 16              // skip pad of 8 + d14
        ADD sp, sp, 16
        POP {r5, r7, r9, r11}
        BX lr

        # Remainder- 1 to 7 bytes of A: one MLAL group per remaining K byte,
        # all into the primary accumulators q8/q9.

        .p2align 3
4:
        AND r5, r5, 7               // kc remainder 1 to 7

        VLD1.8 {d0}, [r3], r5       // load 8, advance a by only the remainder
        VLD1.8 {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP r5, 2
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP r5, 4
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP r5, 6
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B 3b

        # Store odd width (nc remainder): emit 4, then 2, then 1 byte,
        # rotating the vector after each partial store.

        .p2align 3
5:
        TST r1, 4
        BEQ 6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 4
6:
        TST r1, 2
        BEQ 7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 2
7:
        TST r1, 1
        BEQ 8f
        VST1.8 {d0[0]}, [r11]
8:
        VPOP {d8-d13}
        ADD sp, sp, 16              // skip pad of 8 + d14
        ADD sp, sp, 16
        POP {r5, r7, r9, r11}
        BX lr

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif