// Auto-generated file.  Do not edit!
//   Template: src/qs8-gemm/4x8-aarch32-neon-mlal-lane-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const int8_t*restrict a,              r3
//     size_t a_stride,           sp + 72 -> (r7)
//     const void*restrict w,     sp + 76 -> r9
//     int8_t*restrict c,         sp + 80 -> r11
//     size_t cm_stride,          sp + 84 -> (r6)
//     size_t cn_stride,          sp + 88 -> r7
//     xnn_qs8_minmax_params params)  sp + 92 -> (r5)
//
// Computes up to 4 rows x 8 columns of int8 output per pass of the outer loop.
// As consumed by this code, w holds for each block of 8 columns:
//   8 x int32 bias         (32 bytes, loaded into the accumulators)
//   kc x 8 x int8 weights  (8 bytes consumed per byte of A)
//   8 x fp32 scale         (32 bytes, one multiplier per output column)
// Stack offsets above are relative to sp after the 72-byte prologue.

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Register usage
// A0   r3  d0-d1 q0
// A1  r12  d2-d3 q1
// A2  r10  d4-d5 q2
// A3   r0  d6-d7 q3

// B    r9  d10-d11 q5

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// params  d13 (see below)
// Unused  d8-d9, d14-d15.  d12 is saved/restored by VPUSH/VPOP but not otherwise referenced.

// params structure is 4 bytes
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;
// The 4-byte struct is loaded with VLD1.32 {d13[]}, which replicates it into
// both 32-bit lanes of d13; the lane indices above address the second copy.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64
        # Push 72 bytes
        PUSH {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        SUB sp, sp, 8                            // +8
        VPUSH {d10-d13}                          // +32 = 72

        LDR r7, [sp, 72]            // a_stride
        LDR r11, [sp, 80]           // c
        LDR r6, [sp, 84]            // cm_stride
        LDR r9, [sp, 76]            // w
        LDR r5, [sp, 92]            // params

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so all four row pointers
        // are always valid to read from and write to.
        CMP r0, 2                   // if mr >= 2
        ADD r12, r3, r7             //   a1 = a0 + a_stride
        ADD r4, r11, r6             //   c1 = c0 + cm_stride
        MOVLO r12, r3               // a1
        MOVLO r4, r11               // c1
                                    // if mr > 2
        ADD r10, r12, r7            //   a2 = a1 + a_stride
        ADD r8, r4, r6              //   c2 = c1 + cm_stride
        MOVLS r10, r12              // a2
        MOVLS r8, r4                // c2

        CMP r0, 4                   // if mr >=4
        ADD r0, r10, r7             //   a3 = a2 + a_stride
        ADD r6, r8, r6              //   c3 = c2 + cm_stride
        MOVLO r0, r10               // a3
        MOVLO r6, r8                // c3

        # Load params values
        VLD1.32 {d13[]}, [r5]       // QC8 neonv8 params
        LDR r7, [sp, 88]            // cn_stride

        PLD [r9, 64]                // Prefetch B
        PLD [r9, 128]
        PLD [r9, 192]
        PLD [r9, 256]
        PLD [r9, 320]
        PLD [r9, 384]

        // Outer loop: one iteration per block of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM r9!, {d16-d19}         // Bias
        SUBS r5, r2, 8              // k = kc - 8

        // Replicate the bias into the accumulators of rows 1-3.
        VMOV q10, q8
        PLD [r3, 64]                // Prefetch A
        VMOV q11, q9
        PLD [r12, 64]
        VMOV q12, q8
        PLD [r10, 64]
        VMOV q13, q9
        PLD [r0, 64]
        VMOV q14, q8
        VMOV q15, q9
        BLO 3f                      // less than 8 channels?

        # Main loop - 8 bytes
        # 64 bytes for weights.
        // Each step sign-extends 8 int8 weights to int16 in q5 and accumulates
        // them against one int16 lane of each row of A.
        .p2align 3
1:
        VLD1.8 {d0}, [r3]!          // A0
        VLD1.8 {d10}, [r9]!         // B
        VLD1.8 {d2}, [r12]!         // A1
        VLD1.8 {d4}, [r10]!         // A2
        VLD1.8 {d6}, [r0]!          // A3
        SUBS r5, r5, 8              // flags are consumed by BHS at the end of the loop
        PLD [r3, 128]
        VMOVL.S8 q0, d0
        PLD [r12, 128]
        VMOVL.S8 q5, d10
        PLD [r10, 128]
        VMOVL.S8 q1, d2
        PLD [r0, 128]
        VMOVL.S8 q2, d4
        PLD [r9, 448]
        VMOVL.S8 q3, d6
        VMLAL.S16 q8, d10, d0[0]
        VMLAL.S16 q9, d11, d0[0]
        VMLAL.S16 q10, d10, d2[0]
        VMLAL.S16 q11, d11, d2[0]
        VMLAL.S16 q12, d10, d4[0]
        VMLAL.S16 q13, d11, d4[0]
        VMLAL.S16 q14, d10, d6[0]
        VMLAL.S16 q15, d11, d6[0]

        VLD1.8 {d10}, [r9]!

        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[1]
        VMLAL.S16 q9, d11, d0[1]
        VMLAL.S16 q10, d10, d2[1]
        VMLAL.S16 q11, d11, d2[1]
        VMLAL.S16 q12, d10, d4[1]
        VMLAL.S16 q13, d11, d4[1]
        VMLAL.S16 q14, d10, d6[1]
        VMLAL.S16 q15, d11, d6[1]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[2]
        VMLAL.S16 q9, d11, d0[2]
        VMLAL.S16 q10, d10, d2[2]
        VMLAL.S16 q11, d11, d2[2]
        VMLAL.S16 q12, d10, d4[2]
        VMLAL.S16 q13, d11, d4[2]
        VMLAL.S16 q14, d10, d6[2]
        VMLAL.S16 q15, d11, d6[2]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[3]
        VMLAL.S16 q9, d11, d0[3]
        VMLAL.S16 q10, d10, d2[3]
        VMLAL.S16 q11, d11, d2[3]
        VMLAL.S16 q12, d10, d4[3]
        VMLAL.S16 q13, d11, d4[3]
        VMLAL.S16 q14, d10, d6[3]
        VMLAL.S16 q15, d11, d6[3]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[0]
        VMLAL.S16 q9, d11, d1[0]
        VMLAL.S16 q10, d10, d3[0]
        VMLAL.S16 q11, d11, d3[0]
        VMLAL.S16 q12, d10, d5[0]
        VMLAL.S16 q13, d11, d5[0]
        VMLAL.S16 q14, d10, d7[0]
        VMLAL.S16 q15, d11, d7[0]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[1]
        VMLAL.S16 q9, d11, d1[1]
        VMLAL.S16 q10, d10, d3[1]
        VMLAL.S16 q11, d11, d3[1]
        VMLAL.S16 q12, d10, d5[1]
        VMLAL.S16 q13, d11, d5[1]
        VMLAL.S16 q14, d10, d7[1]
        VMLAL.S16 q15, d11, d7[1]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[2]
        VMLAL.S16 q9, d11, d1[2]
        VMLAL.S16 q10, d10, d3[2]
        VMLAL.S16 q11, d11, d3[2]
        VMLAL.S16 q12, d10, d5[2]
        VMLAL.S16 q13, d11, d5[2]
        VMLAL.S16 q14, d10, d7[2]
        VMLAL.S16 q15, d11, d7[2]

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[3]
        VMLAL.S16 q9, d11, d1[3]
        VMLAL.S16 q10, d10, d3[3]
        VMLAL.S16 q11, d11, d3[3]
        VMLAL.S16 q12, d10, d5[3]
        VMLAL.S16 q13, d11, d5[3]
        VMLAL.S16 q14, d10, d7[3]
        VMLAL.S16 q15, d11, d7[3]
        BHS 1b                      // at least 8 more bytes of A remain

        # Is there a remainder?- 1-7 bytes of A
        ADDS r5, r5, 8
        BNE 3f

2:
        # QC8 FP32 quantization
        VLD1.8 {q0-q1}, [r9]!       // 8 per-column fp32 multipliers from w

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9
        VCVT.F32.S32 q10, q10
        VCVT.F32.S32 q11, q11
        VCVT.F32.S32 q12, q12
        VCVT.F32.S32 q13, q13
        VCVT.F32.S32 q14, q14
        VCVT.F32.S32 q15, q15

        VMUL.F32 q8, q8, q0         // multiplier
        VMUL.F32 q9, q9, q1
        VMUL.F32 q10, q10, q0
        VMUL.F32 q11, q11, q1
        VMUL.F32 q12, q12, q0
        VMUL.F32 q13, q13, q1
        VMUL.F32 q14, q14, q0
        VMUL.F32 q15, q15, q1

        // Convert back to int32, rounding to nearest (ties to even).
        VCVTN.S32.F32 q8, q8
        VCVTN.S32.F32 q9, q9
        VCVTN.S32.F32 q10, q10
        VCVTN.S32.F32 q11, q11
        VCVTN.S32.F32 q12, q12
        VCVTN.S32.F32 q13, q13
        VCVTN.S32.F32 q14, q14
        VCVTN.S32.F32 q15, q15

        VDUP.16 q0, d13[2]          // output_zero_point

        // Saturating narrow to int16: q8..q11 now hold rows 0..3 (8 columns each).
        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9
        VQMOVN.S32 d18, q10
        VQMOVN.S32 d19, q11
        VQMOVN.S32 d20, q12
        VQMOVN.S32 d21, q13
        VQMOVN.S32 d22, q14
        VQMOVN.S32 d23, q15

        VQADD.S16 q8, q8, q0
        VQADD.S16 q9, q9, q0
        VQADD.S16 q10, q10, q0
        VQADD.S16 q11, q11, q0

        VDUP.8 q12, d13[6]          // output_min

        // Saturating narrow to int8: d0..d3 hold rows 0..3.
        VQMOVN.S16 d0, q8
        VQMOVN.S16 d1, q9
        VQMOVN.S16 d2, q10
        VQMOVN.S16 d3, q11

        VDUP.8 q13, d13[7]          // output_max

        VMAX.S8 q0, q0, q12
        VMAX.S8 q1, q1, q12

        SUBS r1, r1, 8              // nc -= 8; flags are consumed by BLO and BHI below

        VMIN.S8 q0, q0, q13
        VMIN.S8 q1, q1, q13

        # Store full 4 x 8
        BLO 4f                      // fewer than 8 columns left: partial store
        VST1.8 {d0}, [r11], r7      // c0 += cn_stride
        SUB r3, r3, r2              // rewind a0 by kc for the next column block
        VST1.8 {d1}, [r4], r7
        SUB r12, r12, r2
        VST1.8 {d2}, [r8], r7
        SUB r10, r10, r2
        VST1.8 {d3}, [r6], r7
        SUB r0, r0, r2
        BHI 0b                      // more columns remain

        VPOP {d10-d13}
        ADD sp, sp, 8
        POP {r4, r5, r6, r7, r8, r9, r10, r11}
        BX lr

        # Remainder- 1 to 7 bytes of A
        // A full 8 bytes is loaded from each row, but the A pointers advance
        // by only r5 and only the first r5 lanes are accumulated.
        .p2align 3
3:
        AND r5, r5, 7               // kc remainder 1 to 7

        VLD1.8 {d0}, [r3], r5
        VLD1.8 {d10}, [r9]!
        VLD1.8 {d2}, [r12], r5
        VLD1.8 {d4}, [r10], r5
        VLD1.8 {d6}, [r0], r5

        VMOVL.S8 q0, d0
        VMOVL.S8 q5, d10
        VMOVL.S8 q1, d2
        VMOVL.S8 q2, d4
        VMOVL.S8 q3, d6
        VMLAL.S16 q8, d10, d0[0]
        VMLAL.S16 q9, d11, d0[0]
        VMLAL.S16 q10, d10, d2[0]
        VMLAL.S16 q11, d11, d2[0]
        VMLAL.S16 q12, d10, d4[0]
        VMLAL.S16 q13, d11, d4[0]
        VMLAL.S16 q14, d10, d6[0]
        VMLAL.S16 q15, d11, d6[0]
        CMP r5, 2
        BLO 2b                      // remainder == 1

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[1]
        VMLAL.S16 q9, d11, d0[1]
        VMLAL.S16 q10, d10, d2[1]
        VMLAL.S16 q11, d11, d2[1]
        VMLAL.S16 q12, d10, d4[1]
        VMLAL.S16 q13, d11, d4[1]
        VMLAL.S16 q14, d10, d6[1]
        VMLAL.S16 q15, d11, d6[1]
        BEQ 2b                      // remainder == 2 (flags still from CMP r5, 2)

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[2]
        VMLAL.S16 q9, d11, d0[2]
        VMLAL.S16 q10, d10, d2[2]
        VMLAL.S16 q11, d11, d2[2]
        VMLAL.S16 q12, d10, d4[2]
        VMLAL.S16 q13, d11, d4[2]
        VMLAL.S16 q14, d10, d6[2]
        VMLAL.S16 q15, d11, d6[2]
        CMP r5, 4
        BLO 2b                      // remainder == 3

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d0[3]
        VMLAL.S16 q9, d11, d0[3]
        VMLAL.S16 q10, d10, d2[3]
        VMLAL.S16 q11, d11, d2[3]
        VMLAL.S16 q12, d10, d4[3]
        VMLAL.S16 q13, d11, d4[3]
        VMLAL.S16 q14, d10, d6[3]
        VMLAL.S16 q15, d11, d6[3]
        BEQ 2b                      // remainder == 4

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[0]
        VMLAL.S16 q9, d11, d1[0]
        VMLAL.S16 q10, d10, d3[0]
        VMLAL.S16 q11, d11, d3[0]
        VMLAL.S16 q12, d10, d5[0]
        VMLAL.S16 q13, d11, d5[0]
        VMLAL.S16 q14, d10, d7[0]
        VMLAL.S16 q15, d11, d7[0]
        CMP r5, 6
        BLO 2b                      // remainder == 5

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[1]
        VMLAL.S16 q9, d11, d1[1]
        VMLAL.S16 q10, d10, d3[1]
        VMLAL.S16 q11, d11, d3[1]
        VMLAL.S16 q12, d10, d5[1]
        VMLAL.S16 q13, d11, d5[1]
        VMLAL.S16 q14, d10, d7[1]
        VMLAL.S16 q15, d11, d7[1]
        BEQ 2b                      // remainder == 6

        VLD1.8 {d10}, [r9]!
        VMOVL.S8 q5, d10
        VMLAL.S16 q8, d10, d1[2]
        VMLAL.S16 q9, d11, d1[2]
        VMLAL.S16 q10, d10, d3[2]
        VMLAL.S16 q11, d11, d3[2]
        VMLAL.S16 q12, d10, d5[2]
        VMLAL.S16 q13, d11, d5[2]
        VMLAL.S16 q14, d10, d7[2]
        VMLAL.S16 q15, d11, d7[2]
        B 2b                        // remainder == 7

        # Store odd width
        // r1 holds nc - 8 here; its low three bits equal those of the
        // remaining column count, so they select the 4/2/1-column stores.
        .p2align 3
4:
        TST r1, 4
        BEQ 5f
        VST1.32 {d0[0]}, [r11]!
        VST1.32 {d1[0]}, [r4]!
        VST1.32 {d2[0]}, [r8]!
        VST1.32 {d3[0]}, [r6]!
        VEXT.8 q0, q0, q0, 4        // rotate each row's next columns into lane 0
        VEXT.8 q1, q1, q1, 4
5:
        TST r1, 2
        BEQ 6f
        VST1.16 {d0[0]}, [r11]!
        VST1.16 {d1[0]}, [r4]!
        VST1.16 {d2[0]}, [r8]!
        VST1.16 {d3[0]}, [r6]!
        VEXT.8 q0, q0, q0, 2
        VEXT.8 q1, q1, q1, 2

6:
        TST r1, 1
        BEQ 7f
        VST1.8 {d0[0]}, [r11]
        VST1.8 {d1[0]}, [r4]
        VST1.8 {d2[0]}, [r8]
        VST1.8 {d3[0]}, [r6]

7:
        VPOP {d10-d13}
        ADD sp, sp, 8
        POP {r4, r5, r6, r7, r8, r9, r10, r11}
        BX lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif