; Cost-model tests for vector reductions that are written as a chain of
; shufflevector + (f)add steps and finished by an extractelement of lane 0.
; With -costmodel-reduxcost=true the cost reported on the extractelement is
; expected to reflect the whole reduction tree that feeds it.
;
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Two-step reduction of <4 x float>: add the upper half onto the lower half,
; then add lane 1 onto lane 0.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: reduction_cost
; CHECK: cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Three-step reduction of <8 x i32> using the same upper-half-onto-lower-half
; pattern at each level.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
    <8 x i32> <i32 4, i32 5, i32 6, i32 7,
               i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
               i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
    <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
               i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: reduction_cost_int
; CHECK: cost of 17 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise reduction of <4 x float>: at each level the even-indexed lanes and
; the odd-indexed lanes are extracted by separate shuffles and added together.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
    <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, but the operands of the first fadd are swapped
; (odd-lane shuffle first, even-lane shuffle second).
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
    <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, but the last level adds %bin.rdx.0 directly instead
; of first passing it through a lane-0 shuffle.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
    <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; ---------------------------------------------------------------------------
; Non-pairwise reductions: every level adds the current vector to a shuffle
; that moves its upper half (of the still-live lanes) down to lane 0.
; ---------------------------------------------------------------------------

define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; ---------------------------------------------------------------------------
; Pairwise reductions: every level builds one shuffle of the even-indexed
; lanes and one of the odd-indexed lanes, then adds the two shuffles.
; ---------------------------------------------------------------------------

define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 7 {{.*}} extractelement
; AVX2: cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3: cost of 5 {{.*}} extractelement
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}