// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                                 (r0)
//     size_t nc,                                  r1
//     size_t kc,                                 (r2) -> sp + 56 -> r5
//     size_t ks,                                 (r3) -> sp + 60 -> r14
//     const int8_t** restrict a,       sp + 88  -> r2
//     const void* restrict w,          sp + 92  -> r9
//     int8_t* restrict c,              sp + 96  -> r11
//     size_t cm_stride,                sp + 100 -> r6
//     size_t cn_stride,                sp + 104 -> r12
//     size_t a_offset,                 sp + 108 -> (r5)
//     const int8_t* zero,              sp + 112 -> r7
//     const xnn_qs8_conv_minmax_params* params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3   d0-d1 q0

// B   r9   d8-d9 q4, d10-d11 q5

// C0  r11  d16-d17 q8, d18-d19 q9
//          q2, q3 acc2

// Unused r4, r8, r10, d15, q10-q15, q1

// params structure is 16 bytes
//  struct {
//    int32_t right_pre_shift;    d12[0]
//    int32_t multiplier;         d12[1]
//    int32_t right_post_shift;   d13[0]
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } rndnu_neon;

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}   // +32
        SUB     sp, sp, 8                           // +8
        VPUSH   {d8-d13}                            // +48 = 88

        LDR     r2, [sp, 88]    // a
        LDR     r9, [sp, 92]    // w
        LDR     r11, [sp, 96]   // c
        LDR     r6, [sp, 100]   // cm_stride
        LDR     r12, [sp, 104]  // cn_stride
        LDR     r7, [sp, 112]   // zero
        LDR     r5, [sp, 116]   // params
        MOV     r14, r3         // p = ks

        # Load params values
        VLDM    r5, {d12-d13}   // RNDNU params

        PLD     [r9, 64]        // Prefetch B
        PLD     [r9, 112]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]
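
        // Loop structure:
        //   0:    outer loop, one iteration per 8 output channels (nc)
        //   1:    loop over ks; loads the next A pointer from the
        //         indirection buffer and accumulates along kc
        //   2:    main loop, 8 bytes of A and 64 bytes of B per iteration
        //   3:    epilogue for the last full 8 bytes of the A row
        //   5/6:  remainder path when 1 to 7 bytes of A are left
        //   7-10: partial stores when fewer than 8 outputs remain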

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}  // Bias
        VMOV.I32 q2, 0          // second set of C for pipelining VMLAL
        VMOV.I32 q3, 0

        .p2align 3
1:
        # Load next A pointer
        LDR     r3, [r2, 0]

        # Add a_offset
        LDR     r5, [sp, 108]   // a_offset
        ADD     r2, r2, 4
        CMP     r3, r7          // if a0 == zero
        ADD     r3, r3, r5      // a0 += a_offset
        MOVEQ   r3, r7          // a0 = zero, else a0 += a_offset

        LDR     r5, [sp, 56]    // kc
        SUBS    r5, r5, 8       // kc - 8
        BLO     5f              // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!     // A0
        SUBS    r5, r5, 8       // k = k - 8
        VLD1.8  {d8}, [r9]!     // B0
        BLO     3f              // less than 8 remaining? skip main loop

        // Main loop - 8 bytes of A
        // 64 bytes for weights.

        .p2align 3
2:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!    // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!     // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!    // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!     // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!     // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!    // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!     // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!    // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8

        // BLOCK 7
        VLD1.8  {d8}, [r9]!     // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue

        .p2align 3
3:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!    // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!     // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!    // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!     // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!    // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!     // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!    // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? - 1 to 7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4     // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]   // p = ks

        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3
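
        // The requantization sequence below computes, per 32-bit lane:
        //   acc = sat_shift(acc, right_pre_shift)      (VQSHL; negative count shifts right)
        //   acc = sat((2 * acc * multiplier) >> 32)    (VQDMULH keeps the high half)
        //   acc = round_shift(acc, right_post_shift)   (VRSHL; rounding right shift)
        //   s16 = sat_narrow(acc) + output_zero_point  (VQMOVN.S32, VQADD.S16)
        //   out = clamp(sat_narrow(s16), output_min, output_max)
        // Both shift counts are expected to be <= 0, so the by-register
        // "left" shifts act as right shifts.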
        # RNDNU quantization
        VDUP.32 q0, d12[0]      // right_pre_shift

        VQSHL.S32 q8, q8, q0
        VQSHL.S32 q9, q9, q0

        VDUP.32 q2, d13[0]      // right_post_shift

        VQDMULH.S32 q8, q8, d12[1]  // multiplier
        VQDMULH.S32 q9, q9, d12[1]

        VRSHL.S32 q8, q8, q2
        VRSHL.S32 q9, q9, q2

        VDUP.16 q0, d13[2]      // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8  d24, d13[6]     // output_min

        VQMOVN.S16 d0, q8

        VDUP.8  d25, d13[7]     // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f
        VST1.8  {d0}, [r11], r12
        SUB     r2, r2, r14     // a -= ks
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16      // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder - 1 to 7 bytes of A
        .p2align 3
5:
        AND     r5, r5, 7       // kc remainder 1 to 7
6:
        VLD1.8  {d0}, [r3]
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16      // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
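
// The .note.GNU-stack section above marks the stack as non-executable
// when assembling for ELF targets.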