// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"
$assert DATATYPE in ["QC8", "QS8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

.syntax unified

$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
# LINT.IfChange
// void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch32_neondot_ld64(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                                     r2 -> r5 -> sp + 52
//     size_t ks,                                     r3 -> sp + 56 -> r14
//     const int8_t**restrict a,            sp + 96  -> r2
//     const void*restrict w,               sp + 100 -> r9
//     int8_t*restrict c,                   sp + 104 -> r11
//     size_t cm_stride,                    sp + 108 -> (r6)
//     size_t cn_stride,                    sp + 112 -> (r7)
//     size_t a_offset,                     sp + 116 -> (r5)
//     const int8_t* zero,                  sp + 120 -> (r7)
//     ${PARAMS_UNION}*params);             sp + 124 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Register usage
// A0   r3  d0
// A1  r12  d1
// A2  r10  d2
// A3   r0  d3

// B    r9  q2 q3 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// unused q7

$if REQUANTIZATION == "RNDNU":
  // params structure is 16 bytes
  //  struct {
  //    int32_t right_pre_shift;    d12[0]
  //    int32_t multiplier;         d12[1]
  //    int32_t right_post_shift;   d13[0]
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } rndnu_neon;
$else:
  // params structure is 4 bytes
  //  struct {
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } xnn_qs8_minmax_params.neonv8;

// iOS does not support 32 bit ARM with Neon DotProduct.
#ifndef __APPLE__
BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch32_neondot_ld64
        ADD     r2, r2, 3               // kc = (kc + 3) & ~3
        BIC     r2, r2, 3
        # Push 96 bytes
        # r2 (rounded kc) is saved at sp + 52 and reloaded into r5 for every
        # ks step.  r3 (ks) is saved at sp + 56 and reloaded per nc tile.
        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
        SUB     sp, sp, 4                                        // +4 (pad)
        VPUSH   {d8-d13}                                         // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r6, [sp, 108]           // cm_stride
        LDR     r2, [sp, 96]            // a
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 124]           // params
        MOV     r14, r3                 // p = ks

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row's pointer.
        CMP     r0, 2                   // if mr >= 2
        ADD     r4, r11, r6             //   c1 = c0 + cm_stride
        MOVLO   r4, r11                 // c1
                                        // if mr > 2
        ADD     r8, r4, r6              //   c2 = c1 + cm_stride
        MOVLS   r8, r4                  // c2
        CMP     r0, 4                   // if mr >= 4
        ADD     r6, r8, r6              //   c3 = c2 + cm_stride
        MOVLO   r6, r8                  // c3

        # Load params values
        $if REQUANTIZATION == "RNDNU":
          VLDM    r5, {d12-d13}           // RNDNU params
        $else:
          VLD1.32 {d13[]}, [r5]           // QC8 params

        # Outer loop: one iteration per tile of 8 output channels (nc).
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV    q10, q8
        VMOV    q11, q9
        LDR     r7, [sp, 120]           // zero
        VMOV    q12, q8
        VMOV    q13, q9
        VMOV    q14, q8
        VMOV    q15, q9

        # ks loop: one iteration per group of 4 A pointers.
1:
        # Load next 4 A pointers
        LDR     r3, [r2,  0]
        LDR     r12, [r2,  4]
        LDR     r10, [r2,  8]
        LDR     r0, [r2, 12]
        ADD     r2, r2, 16

        # Add a_offset
        # Pointers equal to `zero` are left untouched.
        LDR     r5, [sp, 116]           // a_offset
        CMP     r3,  r7                 // if a0 == zero
        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 //   a0 = zero, else a0 + a_offset
        CMP     r12,  r7                // if a1 == zero
        ADD     r12, r12, r5            // a1 += a_offset
        MOVEQ   r12,  r7                //   a1 = zero, else a1 + a_offset
        CMP     r10,  r7                // if a2 == zero
        ADD     r10, r10, r5            // a2 += a_offset
        MOVEQ   r10,  r7                //   a2 = zero, else a2 + a_offset
        CMP     r0,  r7                 // if a3 == zero
        ADD     r0,  r0, r5             // a3 += a_offset
        LDR     r5, [sp, 52]            // kc
        MOVEQ   r0,  r7                 //   a3 = zero, else a3 + a_offset

        SUBS    r5, r5, 8               // kc - 8
        BLO     4f                      // less than 8 channels?

        # Main loop - 8 bytes of A.
        # 16 SDOT, 4 LD64 A, 4 LD128 B
        .p2align 3
2:
        VLD1.8  {d0},  [r3]!            // A0
        VLD1.8  {q2},  [r9]!            // B0
        VLD1.8  {d1}, [r12]!            // A1
        VLD1.8  {q3},  [r9]!            // B1
        VLD1.8  {d2}, [r10]!            // A2
        VLD1.8  {q4},  [r9]!            // B2
        VLD1.8  {d3},  [r0]!            // A3
        VLD1.8  {q5},  [r9]!            // B3
        SUBS    r5, r5, 8

        VSDOT.S8 q8, q2, d0[0]
        VSDOT.S8 q9, q3, d0[0]
        VSDOT.S8 q10, q2, d1[0]
        VSDOT.S8 q11, q3, d1[0]
        VSDOT.S8 q12, q2, d2[0]
        VSDOT.S8 q13, q3, d2[0]
        VSDOT.S8 q14, q2, d3[0]
        VSDOT.S8 q15, q3, d3[0]

        VSDOT.S8 q8, q4, d0[1]
        VSDOT.S8 q9, q5, d0[1]
        VSDOT.S8 q10, q4, d1[1]
        VSDOT.S8 q11, q5, d1[1]
        VSDOT.S8 q12, q4, d2[1]
        VSDOT.S8 q13, q5, d2[1]
        VSDOT.S8 q14, q4, d3[1]
        VSDOT.S8 q15, q5, d3[1]
        BHS     2b                      // flags from SUBS above; NEON ops leave them intact

        # Is there a remainder?- 4 bytes of A
        TST     r5, 4
        BNE     4f

3:
        # ks loop
        SUBS    r14, r14, 16            // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r7, [sp, 112]           // cn_stride
        LDR     r14, [sp, 56]           // p = ks

        $if REQUANTIZATION == "RNDNU":
          # RNDNU quantization
          VDUP.32 q0, d12[0]              // right_pre_shift

          VQSHL.S32 q8,  q8, q0
          VQSHL.S32 q9,  q9, q0
          VQSHL.S32 q10, q10, q0
          VQSHL.S32 q11, q11, q0
          VQSHL.S32 q12, q12, q0
          VQSHL.S32 q13, q13, q0
          VQSHL.S32 q14, q14, q0
          VQSHL.S32 q15, q15, q0

          VDUP.32 q2, d13[0]              // right_post_shift

          VQDMULH.S32 q8,  q8, d12[1]     // multiplier
          VQDMULH.S32 q9,  q9, d12[1]
          VQDMULH.S32 q10, q10, d12[1]
          VQDMULH.S32 q11, q11, d12[1]
          VQDMULH.S32 q12, q12, d12[1]
          VQDMULH.S32 q13, q13, d12[1]
          VQDMULH.S32 q14, q14, d12[1]
          VQDMULH.S32 q15, q15, d12[1]

          VRSHL.S32 q8,  q8, q2
          VRSHL.S32 q9,  q9, q2
          VRSHL.S32 q10, q10, q2
          VRSHL.S32 q11, q11, q2
          VRSHL.S32 q12, q12, q2
          VRSHL.S32 q13, q13, q2
          VRSHL.S32 q14, q14, q2
          VRSHL.S32 q15, q15, q2
        $else:
          # QC8 FP32 quantization
          # Eight float multipliers follow the weights in w.
          VLD1.8  {q0-q1},  [r9]!

          VCVT.F32.S32 q8,  q8
          VCVT.F32.S32 q9,  q9
          VCVT.F32.S32 q10, q10
          VCVT.F32.S32 q11, q11
          VCVT.F32.S32 q12, q12
          VCVT.F32.S32 q13, q13
          VCVT.F32.S32 q14, q14
          VCVT.F32.S32 q15, q15

          VMUL.F32 q8,  q8, q0            // multiplier
          VMUL.F32 q9,  q9, q1
          VMUL.F32 q10, q10, q0
          VMUL.F32 q11, q11, q1
          VMUL.F32 q12, q12, q0
          VMUL.F32 q13, q13, q1
          VMUL.F32 q14, q14, q0
          VMUL.F32 q15, q15, q1

          VCVTN.S32.F32 q8,  q8
          VCVTN.S32.F32 q9,  q9
          VCVTN.S32.F32 q10, q10
          VCVTN.S32.F32 q11, q11
          VCVTN.S32.F32 q12, q12
          VCVTN.S32.F32 q13, q13
          VCVTN.S32.F32 q14, q14
          VCVTN.S32.F32 q15, q15
        VDUP.16 q0, d13[2]              // output_zero_point

        # Narrow 32 -> 16 bit.  Each row's 8 results end up in one q register:
        # q8 = row 0, q9 = row 1, q10 = row 2, q11 = row 3.
        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9
        VQMOVN.S32 d18, q10
        VQMOVN.S32 d19, q11
        VQMOVN.S32 d20, q12
        VQMOVN.S32 d21, q13
        VQMOVN.S32 d22, q14
        VQMOVN.S32 d23, q15

        VQADD.S16 q8,  q8, q0
        VQADD.S16 q9,  q9, q0
        VQADD.S16 q10, q10, q0
        VQADD.S16 q11, q11, q0

        VDUP.8  q12, d13[6]             // output_min

        # Narrow 16 -> 8 bit: d0 = row 0, d1 = row 1, d2 = row 2, d3 = row 3.
        VQMOVN.S16 d0,  q8
        VQMOVN.S16 d1,  q9
        VQMOVN.S16 d2, q10
        VQMOVN.S16 d3, q11

        VDUP.8  q13, d13[7]             // output_max

        VMAX.S8 q0, q0, q12
        VMAX.S8 q1, q1, q12

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.S8 q0, q0, q13
        VMIN.S8 q1, q1, q13

        # Store full 4 x 8
        BLO     5f                      // fewer than 8 columns left: partial store
        VST1.8  {d3}, [r6], r7
        VST1.8  {d2}, [r8], r7
        VST1.8  {d1}, [r4], r7
        VST1.8  {d0}, [r11], r7
        SUB     r2, r2, r14             // a -= ks
        BHI     0b                      // flags still from SUBS r1 above

        VPOP    {d8-d13}
        ADD     sp, sp, 12              // skip pad, r2, r3
        POP     {r4, r5, r6, r7, r8, r9, r10, r11, pc}

4:
        # Remainder- 4 bytes of A
        VLD1.32 {d0[0]},  [r3]!         // A0
        VLD1.32 {q2},  [r9]!            // B0
        VLD1.32 {d1[0]}, [r12]!         // A1
        VLD1.32 {q3},  [r9]!            // B1
        VLD1.32 {d2[0]}, [r10]!         // A2
        VLD1.32 {d3[0]},  [r0]!         // A3

        VSDOT.S8 q8, q2, d0[0]
        VSDOT.S8 q9, q3, d0[0]
        VSDOT.S8 q10, q2, d1[0]
        VSDOT.S8 q11, q3, d1[0]
        VSDOT.S8 q12, q2, d2[0]
        VSDOT.S8 q13, q3, d2[0]
        VSDOT.S8 q14, q2, d3[0]
        VSDOT.S8 q15, q3, d3[0]
        B       3b

        # Store odd width
        # After each partial store the q registers are rotated so the next
        # unstored bytes of every row sit in lane 0 of d0-d3.
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d3[0]}, [r6]!
        VST1.32 {d2[0]}, [r8]!
        VST1.32 {d1[0]}, [r4]!
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q1, q1, q1, 4
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d3[0]}, [r6]!
        VST1.16 {d2[0]}, [r8]!
        VST1.16 {d1[0]}, [r4]!
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q1, q1, q1, 2
        VEXT.8  q0, q0, q0, 2

7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d3[0]}, [r6]
        VST1.8  {d2[0]}, [r8]
        VST1.8  {d1[0]}, [r4]
        VST1.8  {d0[0]}, [r11]

8:
        VPOP    {d8-d13}
        ADD     sp, sp, 12              // skip pad, r2, r3
        POP     {r4, r5, r6, r7, r8, r9, r10, r11, pc}

END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch32_neondot_ld64
# LINT.ThenChange(4x8c4-rndnu-aarch32-neondot-ld64.cc)
#endif  // __APPLE__

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif