// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const uint8_t* restrict a,            r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void* restrict w,    sp + 100 -> r9
//     uint8_t* restrict c,       sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qu8_conv_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on the cortex_a53 microkernel, but with NEON loads.

// Register usage
// A0  r3   d0-d1 q0

// B   r9   d8-d9 q4  q5

// C0  r11  d16-d17 q8  d18-d19 q9
//          q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d15, q1, q10-q11, q13-q15
// (d24-d25 of q12 hold output_min/output_max in the quantization tail)

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];  d14
#    int32_t right_pre_shift;       d12[0]
#    int32_t multiplier;            d12[1]
#    int32_t right_post_shift;      d13[0]
#    int16_t output_zero_point;     d13[2]
#    uint8_t output_min;            d13[6]
#    uint8_t output_max;            d13[7]
#  } rndnu_neon;
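# For orientation, a scalar C sketch of the arithmetic this kernel performs
# (illustrative pseudocode only, not part of the generated output; "kzp" is
# the kernel_zero_point broadcast into d14, requantize_rndnu() is a
# hypothetical helper for the RNDNU sequence sketched further below, and the
# packed weights are assumed to hold 8 int32 bias values followed by 8-byte
# rows of B, as the VLDM bias load and the 8-byte B loads suggest):
#
#   for (size_t n = 0; n < 8; n++) {
#     int32_t acc = bias[n];
#     for (size_t k = 0; k < kc; k++) {
#       acc += (int32_t) a[k] * ((int32_t) b[k][n] - (int32_t) kzp);
#     }
#     c[n] = requantize_rndnu(acc);
#   }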
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}       // 16
        SUB     sp, sp, 24              // +24
        VPUSH   {d8-d14}                // +56 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point
        VLDM    r5, {d12-d13}           // RNDNU params
        LDR     r7, [sp, 112]           // cn_stride

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!             // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8}, [r9]!             // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!             // A0
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 7
        VLD1.8  {d8}, [r9]!             // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue

        .p2align 3
2:
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? 1-7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3
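        # Scalar sketch of the RNDNU requantization performed below
        # (illustrative pseudocode; the helper names are hypothetical, and
        # each step maps to the NEON instruction noted on the right; the
        # "right" shifts are encoded as negative left-shift amounts):
        #
        #   acc = sat_shift_left(acc, right_pre_shift);        // VQSHL.S32
        #   acc = sat_doubling_mul_high(acc, multiplier);      // VQDMULH.S32
        #   acc = rounding_shift_left(acc, right_post_shift);  // VRSHL.S32
        #   s16 = sat_narrow_to_s16(acc) + output_zero_point;  // VQMOVN.S32, VQADD.S16
        #   out = clamp(sat_narrow_to_u8(s16),                 // VQMOVUN.S16
        #               output_min, output_max);               // VMAX.U8, VMIN.U8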
        # RNDNU quantization
        VDUP.32 q0, d12[0]              // right_pre_shift

        VQSHL.S32 q8, q8, q0
        VQSHL.S32 q9, q9, q0

        VDUP.32 q2, d13[0]              // right_post_shift

        VQDMULH.S32 q8, q8, d12[1]      // multiplier
        VQDMULH.S32 q9, q9, d12[1]

        VRSHL.S32 q8, q8, q2
        VRSHL.S32 q9, q9, q2

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVUN.S16 d0, q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.U8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.U8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip pad of 8
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        VLD1.8  {d0}, [r3], r5
        VLD1.8  {d8}, [r9]!

        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip pad of 8
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
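// A rough invocation sketch in C (illustrative only; it follows the argument
// layout documented at the top of this file, with params passed by address
// as the LDR from sp + 116 implies; the variable names are hypothetical):
//
//   union xnn_qu8_conv_minmax_params params;  // rndnu_neon member initialized elsewhere
//   xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//       /*mr=*/1, /*nc=*/nc, /*kc=*/kc,
//       a, a_stride, w, c, cm_stride, cn_stride, &params);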