xref: /aosp_15_r20/external/llvm/test/Analysis/CostModel/X86/reduction.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
2*9880d681SAndroid Build Coastguard Worker; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
3*9880d681SAndroid Build Coastguard Worker; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
4*9880d681SAndroid Build Coastguard Worker; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
5*9880d681SAndroid Build Coastguard Worker
; Non-pairwise fadd reduction of a <4 x float>: the upper half is folded onto
; the lower half (mask <2, 3, undef, undef>), then lane 1 is folded onto lane 0
; and lane 0 is returned. Only the default CHECK prefix (the -mcpu=core2 RUN
; line) has an expectation in this function.
6*9880d681SAndroid Build Coastguard Workerdefine fastcc float @reduction_cost_float(<4 x float> %rdx) {
7*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
8*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
9*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
10*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
11*9880d681SAndroid Build Coastguard Worker
12*9880d681SAndroid Build Coastguard Worker; Check that we recognize the tree starting at the extractelement as a
13*9880d681SAndroid Build Coastguard Worker; reduction.
; The label pattern below is a prefix of this function's name; the expected
; cost is reported on the extractelement that ends the tree.
14*9880d681SAndroid Build Coastguard Worker; CHECK-LABEL: reduction_cost
15*9880d681SAndroid Build Coastguard Worker; CHECK:  cost of 9 {{.*}} extractelement
16*9880d681SAndroid Build Coastguard Worker
17*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx8, i32 0
18*9880d681SAndroid Build Coastguard Worker  ret float %r
19*9880d681SAndroid Build Coastguard Worker}
20*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of an <8 x i32> in three halving steps:
; lanes 4-7 onto 0-3, lanes 2-3 onto 0-1, then lane 1 onto lane 0. Lane 0 is
; returned. Only the default CHECK prefix (-mcpu=core2) is checked here.
21*9880d681SAndroid Build Coastguard Workerdefine fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
22*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
23*9880d681SAndroid Build Coastguard Worker   <8 x i32> <i32 4    , i32     5, i32     6, i32     7,
24*9880d681SAndroid Build Coastguard Worker              i32 undef, i32 undef, i32 undef, i32 undef>
25*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
26*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
27*9880d681SAndroid Build Coastguard Worker   <8 x i32> <i32 2    , i32 3,     i32 undef, i32 undef,
28*9880d681SAndroid Build Coastguard Worker              i32 undef, i32 undef, i32 undef, i32 undef>
29*9880d681SAndroid Build Coastguard Worker  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
30*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
31*9880d681SAndroid Build Coastguard Worker   <8 x i32> <i32 1    , i32 undef, i32 undef, i32 undef,
32*9880d681SAndroid Build Coastguard Worker              i32 undef, i32 undef, i32 undef, i32 undef>
33*9880d681SAndroid Build Coastguard Worker  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
34*9880d681SAndroid Build Coastguard Worker
; Expected cost of the whole reduction, reported on the final extractelement.
35*9880d681SAndroid Build Coastguard Worker; CHECK-LABEL: reduction_cost_int
36*9880d681SAndroid Build Coastguard Worker; CHECK:  cost of 17 {{.*}} extractelement
37*9880d681SAndroid Build Coastguard Worker
38*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
39*9880d681SAndroid Build Coastguard Worker  ret i32 %r
40*9880d681SAndroid Build Coastguard Worker}
41*9880d681SAndroid Build Coastguard Worker
; Pairwise fadd reduction of a <4 x float>. Each level builds an even-lane
; shuffle and an odd-lane shuffle of the same source and adds them, so adjacent
; lanes are summed: (0+1, 2+3) first, then the two partial sums. The extracted
; lane 0 is then added to the scalar %f1 before returning. Only the default
; CHECK prefix (-mcpu=core2) is checked here.
42*9880d681SAndroid Build Coastguard Workerdefine fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
43*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
44*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
45*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
46*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
47*9880d681SAndroid Build Coastguard Worker  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
48*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
49*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
50*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
51*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
52*9880d681SAndroid Build Coastguard Worker  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
53*9880d681SAndroid Build Coastguard Worker
; Expected cost of the pairwise tree, reported on the extractelement.
54*9880d681SAndroid Build Coastguard Worker; CHECK-LABEL: pairwise_hadd
55*9880d681SAndroid Build Coastguard Worker; CHECK: cost of 11 {{.*}} extractelement
56*9880d681SAndroid Build Coastguard Worker
57*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx.1, i32 0
58*9880d681SAndroid Build Coastguard Worker  %r2 = fadd float %r, %f1
59*9880d681SAndroid Build Coastguard Worker  ret float %r2
60*9880d681SAndroid Build Coastguard Worker}
61*9880d681SAndroid Build Coastguard Worker
; Same pairwise <4 x float> fadd reduction as @pairwise_hadd, except that the
; first-level fadd takes its operands in the opposite order (odd-lane shuffle
; first, even-lane shuffle second). The expected cost is the same value, 11,
; under the default CHECK prefix (-mcpu=core2).
62*9880d681SAndroid Build Coastguard Workerdefine fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
63*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
64*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
65*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
66*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
; Operands deliberately swapped relative to @pairwise_hadd.
67*9880d681SAndroid Build Coastguard Worker  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
68*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
69*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
70*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
71*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
72*9880d681SAndroid Build Coastguard Worker  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
73*9880d681SAndroid Build Coastguard Worker
74*9880d681SAndroid Build Coastguard Worker; CHECK-LABEL: pairwise_hadd_assoc
75*9880d681SAndroid Build Coastguard Worker; CHECK: cost of 11 {{.*}} extractelement
76*9880d681SAndroid Build Coastguard Worker
77*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx.1, i32 0
78*9880d681SAndroid Build Coastguard Worker  %r2 = fadd float %r, %f1
79*9880d681SAndroid Build Coastguard Worker  ret float %r2
80*9880d681SAndroid Build Coastguard Worker}
81*9880d681SAndroid Build Coastguard Worker
; Variant of @pairwise_hadd in which the last level omits the lane-0 shuffle:
; the final fadd uses %bin.rdx.0 directly as its left operand and only the
; lane-1 shuffle as its right operand. The expected cost is still 11 under the
; default CHECK prefix (-mcpu=core2).
82*9880d681SAndroid Build Coastguard Workerdefine fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
83*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
84*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
85*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
86*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
87*9880d681SAndroid Build Coastguard Worker  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
88*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
89*9880d681SAndroid Build Coastguard Worker        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; No <0, undef, ...> shuffle here: lane 0 of %bin.rdx.0 is used as-is.
90*9880d681SAndroid Build Coastguard Worker  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
91*9880d681SAndroid Build Coastguard Worker
92*9880d681SAndroid Build Coastguard Worker; CHECK-LABEL: pairwise_hadd_skip_first
93*9880d681SAndroid Build Coastguard Worker; CHECK: cost of 11 {{.*}} extractelement
94*9880d681SAndroid Build Coastguard Worker
95*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx.1, i32 0
96*9880d681SAndroid Build Coastguard Worker  %r2 = fadd float %r, %f1
97*9880d681SAndroid Build Coastguard Worker  ret float %r2
98*9880d681SAndroid Build Coastguard Worker}
99*9880d681SAndroid Build Coastguard Worker
; Non-pairwise fadd reduction of a <2 x double>: lane 1 is shuffled into lane 0
; and added to the original vector; lane 0 is returned. %f1 is not used in the
; body. Costs are checked for the SSE3 (corei7), AVX (corei7-avx) and AVX2
; (core-avx2) RUN lines.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
100*9880d681SAndroid Build Coastguard Workerdefine fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
101*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
102*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
103*9880d681SAndroid Build Coastguard Worker
104*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 2 {{.*}} extractelement
105*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 2 {{.*}} extractelement
106*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 2 {{.*}} extractelement
107*9880d681SAndroid Build Coastguard Worker
108*9880d681SAndroid Build Coastguard Worker  %r = extractelement <2 x double> %bin.rdx, i32 0
109*9880d681SAndroid Build Coastguard Worker  ret double %r
110*9880d681SAndroid Build Coastguard Worker}
111*9880d681SAndroid Build Coastguard Worker
; Non-pairwise fadd reduction of a <4 x float> in two halving steps (lanes 2-3
; onto 0-1, then lane 1 onto lane 0); lane 0 is returned. %f1 is not used in
; the body. Costs are checked for the SSE3, AVX and AVX2 prefixes; the SSE3
; expectation (4) differs from the AVX/AVX2 one (3).
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
112*9880d681SAndroid Build Coastguard Workerdefine fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
113*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
114*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
115*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
116*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
117*9880d681SAndroid Build Coastguard Worker
118*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 4 {{.*}} extractelement
119*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 3 {{.*}} extractelement
120*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 3 {{.*}} extractelement
121*9880d681SAndroid Build Coastguard Worker
122*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx8, i32 0
123*9880d681SAndroid Build Coastguard Worker  ret float %r
124*9880d681SAndroid Build Coastguard Worker}
125*9880d681SAndroid Build Coastguard Worker
; Non-pairwise fadd reduction of a <4 x double> in two halving steps (lanes 2-3
; onto 0-1, then lane 1 onto lane 0); lane 0 is returned. %f1 is not used in
; the body. Only the AVX and AVX2 prefixes have expectations here; there is no
; SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
126*9880d681SAndroid Build Coastguard Workerdefine fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
127*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
128*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
129*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
130*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
131*9880d681SAndroid Build Coastguard Worker
132*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 3 {{.*}} extractelement
133*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 3 {{.*}} extractelement
134*9880d681SAndroid Build Coastguard Worker
135*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x double> %bin.rdx8, i32 0
136*9880d681SAndroid Build Coastguard Worker  ret double %r
137*9880d681SAndroid Build Coastguard Worker}
138*9880d681SAndroid Build Coastguard Worker
; Non-pairwise fadd reduction of an <8 x float> in three halving steps (lanes
; 4-7 onto 0-3, lanes 2-3 onto 0-1, lane 1 onto lane 0); lane 0 is returned.
; %f1 is not used in the body. Only the AVX and AVX2 prefixes have expectations
; here; there is no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
139*9880d681SAndroid Build Coastguard Workerdefine fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
140*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
141*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
142*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
143*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
144*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
145*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
146*9880d681SAndroid Build Coastguard Worker
147*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 4 {{.*}} extractelement
148*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 4 {{.*}} extractelement
149*9880d681SAndroid Build Coastguard Worker
150*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x float> %bin.rdx8, i32 0
151*9880d681SAndroid Build Coastguard Worker  ret float %r
152*9880d681SAndroid Build Coastguard Worker}
153*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of a <2 x i64>: lane 1 is shuffled into
; lane 0 and added to the original vector; lane 0 is returned. %f1 is not used
; in the body. Costs are checked for the SSE3, AVX and AVX2 prefixes; the SSE3
; expectation (2) differs from the AVX/AVX2 one (1).
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
154*9880d681SAndroid Build Coastguard Workerdefine fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
155*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
156*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
157*9880d681SAndroid Build Coastguard Worker
158*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 2 {{.*}} extractelement
159*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 1 {{.*}} extractelement
160*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 1 {{.*}} extractelement
161*9880d681SAndroid Build Coastguard Worker
162*9880d681SAndroid Build Coastguard Worker  %r = extractelement <2 x i64> %bin.rdx, i32 0
163*9880d681SAndroid Build Coastguard Worker  ret i64 %r
164*9880d681SAndroid Build Coastguard Worker}
165*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of a <4 x i32> in two halving steps (lanes
; 2-3 onto 0-1, then lane 1 onto lane 0); lane 0 is returned. %f1 is not used
; in the body. Costs are checked for the SSE3, AVX and AVX2 prefixes, all with
; the same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
166*9880d681SAndroid Build Coastguard Workerdefine fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
167*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
168*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
169*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
170*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
171*9880d681SAndroid Build Coastguard Worker
172*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 3 {{.*}} extractelement
173*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 3 {{.*}} extractelement
174*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 3 {{.*}} extractelement
175*9880d681SAndroid Build Coastguard Worker
176*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x i32> %bin.rdx8, i32 0
177*9880d681SAndroid Build Coastguard Worker  ret i32 %r
178*9880d681SAndroid Build Coastguard Worker}
179*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of a <4 x i64> in two halving steps (lanes
; 2-3 onto 0-1, then lane 1 onto lane 0); lane 0 is returned. %f1 is not used
; in the body. Only the AVX and AVX2 prefixes have expectations here; there is
; no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
180*9880d681SAndroid Build Coastguard Workerdefine fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
181*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
182*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
183*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
184*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
185*9880d681SAndroid Build Coastguard Worker
186*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 3 {{.*}} extractelement
187*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 3 {{.*}} extractelement
188*9880d681SAndroid Build Coastguard Worker
189*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x i64> %bin.rdx8, i32 0
190*9880d681SAndroid Build Coastguard Worker  ret i64 %r
191*9880d681SAndroid Build Coastguard Worker}
192*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of an <8 x i16> in three halving steps
; (lanes 4-7 onto 0-3, lanes 2-3 onto 0-1, lane 1 onto lane 0); lane 0 is
; returned. %f1 is not used in the body. Costs are checked for the SSE3, AVX
; and AVX2 prefixes, all with the same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
193*9880d681SAndroid Build Coastguard Workerdefine fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
194*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
195*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
196*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
197*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
198*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
199*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
200*9880d681SAndroid Build Coastguard Worker
201*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 4 {{.*}} extractelement
202*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 4 {{.*}} extractelement
203*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 4 {{.*}} extractelement
204*9880d681SAndroid Build Coastguard Worker
205*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x i16> %bin.rdx8, i32 0
206*9880d681SAndroid Build Coastguard Worker  ret i16 %r
207*9880d681SAndroid Build Coastguard Worker}
208*9880d681SAndroid Build Coastguard Worker
; Non-pairwise integer add reduction of an <8 x i32> in three halving steps
; (lanes 4-7 onto 0-3, lanes 2-3 onto 0-1, lane 1 onto lane 0); lane 0 is
; returned. %f1 is not used in the body. Only the AVX and AVX2 prefixes have
; expectations here; there is no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
209*9880d681SAndroid Build Coastguard Workerdefine fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
210*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
211*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
212*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
213*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
214*9880d681SAndroid Build Coastguard Worker  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
215*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
216*9880d681SAndroid Build Coastguard Worker
217*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 5 {{.*}} extractelement
218*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 5 {{.*}} extractelement
219*9880d681SAndroid Build Coastguard Worker
220*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x i32> %bin.rdx8, i32 0
221*9880d681SAndroid Build Coastguard Worker  ret i32 %r
222*9880d681SAndroid Build Coastguard Worker}
223*9880d681SAndroid Build Coastguard Worker
; Pairwise fadd reduction of a <2 x double>: a lane-0 shuffle and a lane-1
; shuffle of %rdx are added together and lane 0 is returned. %f1 is not used in
; the body. Costs are checked for the SSE3, AVX and AVX2 prefixes, all with the
; same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
224*9880d681SAndroid Build Coastguard Workerdefine fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
225*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
226*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
227*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
228*9880d681SAndroid Build Coastguard Worker
229*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 2 {{.*}} extractelement
230*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 2 {{.*}} extractelement
231*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 2 {{.*}} extractelement
232*9880d681SAndroid Build Coastguard Worker
233*9880d681SAndroid Build Coastguard Worker  %r = extractelement <2 x double> %bin.rdx8, i32 0
234*9880d681SAndroid Build Coastguard Worker  ret double %r
235*9880d681SAndroid Build Coastguard Worker}
236*9880d681SAndroid Build Coastguard Worker
; Pairwise fadd reduction of a <4 x float> in two levels: even lanes <0, 2> are
; added to odd lanes <1, 3>, then lane 0 of that result is added to lane 1;
; lane 0 is returned. %f1 is not used in the body. Costs are checked for the
; SSE3, AVX and AVX2 prefixes, all with the same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
237*9880d681SAndroid Build Coastguard Workerdefine fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
238*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
239*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
240*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
241*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
242*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
243*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
244*9880d681SAndroid Build Coastguard Worker
245*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 4 {{.*}} extractelement
246*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 4 {{.*}} extractelement
247*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 4 {{.*}} extractelement
248*9880d681SAndroid Build Coastguard Worker
249*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x float> %bin.rdx8, i32 0
250*9880d681SAndroid Build Coastguard Worker  ret float %r
251*9880d681SAndroid Build Coastguard Worker}
252*9880d681SAndroid Build Coastguard Worker
; Pairwise fadd reduction of a <4 x double> in two levels: even lanes <0, 2>
; are added to odd lanes <1, 3>, then lane 0 of that result is added to lane 1;
; lane 0 is returned. %f1 is not used in the body. Only the AVX and AVX2
; prefixes have expectations here; there is no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
253*9880d681SAndroid Build Coastguard Workerdefine fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
254*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
255*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
256*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
257*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
258*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
259*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
260*9880d681SAndroid Build Coastguard Worker
261*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 5 {{.*}} extractelement
262*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 5 {{.*}} extractelement
263*9880d681SAndroid Build Coastguard Worker
264*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x double> %bin.rdx8, i32 0
265*9880d681SAndroid Build Coastguard Worker  ret double %r
266*9880d681SAndroid Build Coastguard Worker}
267*9880d681SAndroid Build Coastguard Worker
; Pairwise fadd reduction of an <8 x float> in three levels. Each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous result: masks
; <0, 2, 4, 6>/<1, 3, 5, 7>, then <0, 2>/<1, 3>, then <0>/<1>. Lane 0 is
; returned. %f1 is not used in the body. Only the AVX and AVX2 prefixes have
; expectations here; there is no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
268*9880d681SAndroid Build Coastguard Workerdefine fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
269*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
270*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
271*9880d681SAndroid Build Coastguard Worker  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
272*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
273*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
274*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
275*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
276*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
277*9880d681SAndroid Build Coastguard Worker  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
278*9880d681SAndroid Build Coastguard Worker
279*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 7 {{.*}} extractelement
280*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 7 {{.*}} extractelement
281*9880d681SAndroid Build Coastguard Worker
282*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x float> %bin.rdx9, i32 0
283*9880d681SAndroid Build Coastguard Worker  ret float %r
284*9880d681SAndroid Build Coastguard Worker}
285*9880d681SAndroid Build Coastguard Worker
; Pairwise integer add reduction of a <2 x i64>: a lane-0 shuffle and a lane-1
; shuffle of %rdx are added together and lane 0 is returned. %f1 is not used in
; the body. Costs are checked for the SSE3, AVX and AVX2 prefixes; the SSE3
; expectation (2) differs from the AVX/AVX2 one (1).
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
286*9880d681SAndroid Build Coastguard Workerdefine fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
287*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
288*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
289*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
290*9880d681SAndroid Build Coastguard Worker
291*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 2 {{.*}} extractelement
292*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 1 {{.*}} extractelement
293*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 1 {{.*}} extractelement
294*9880d681SAndroid Build Coastguard Worker
295*9880d681SAndroid Build Coastguard Worker  %r = extractelement <2 x i64> %bin.rdx8, i32 0
296*9880d681SAndroid Build Coastguard Worker  ret i64 %r
297*9880d681SAndroid Build Coastguard Worker}
298*9880d681SAndroid Build Coastguard Worker
; Pairwise integer add reduction of a <4 x i32> in two levels: even lanes
; <0, 2> are added to odd lanes <1, 3>, then lane 0 of that result is added to
; lane 1; lane 0 is returned. %f1 is not used in the body. Costs are checked
; for the SSE3, AVX and AVX2 prefixes, all with the same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
299*9880d681SAndroid Build Coastguard Workerdefine fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
300*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
301*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
302*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
303*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
304*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
305*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
306*9880d681SAndroid Build Coastguard Worker
307*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 3 {{.*}} extractelement
308*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 3 {{.*}} extractelement
309*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 3 {{.*}} extractelement
310*9880d681SAndroid Build Coastguard Worker
311*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x i32> %bin.rdx8, i32 0
312*9880d681SAndroid Build Coastguard Worker  ret i32 %r
313*9880d681SAndroid Build Coastguard Worker}
314*9880d681SAndroid Build Coastguard Worker
; Pairwise integer add reduction of a <4 x i64> in two levels: even lanes
; <0, 2> are added to odd lanes <1, 3>, then lane 0 of that result is added to
; lane 1; lane 0 is returned. %f1 is not used in the body. Only the AVX and
; AVX2 prefixes have expectations here; there is no SSE3 check for this
; function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
315*9880d681SAndroid Build Coastguard Workerdefine fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
316*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
317*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
318*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
319*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
320*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
321*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
322*9880d681SAndroid Build Coastguard Worker
323*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 5 {{.*}} extractelement
324*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 5 {{.*}} extractelement
325*9880d681SAndroid Build Coastguard Worker
326*9880d681SAndroid Build Coastguard Worker  %r = extractelement <4 x i64> %bin.rdx8, i32 0
327*9880d681SAndroid Build Coastguard Worker  ret i64 %r
328*9880d681SAndroid Build Coastguard Worker}
329*9880d681SAndroid Build Coastguard Worker
; Pairwise integer add reduction of an <8 x i16> in three levels. Each level
; adds an even-lane shuffle to an odd-lane shuffle of the previous result:
; masks <0, 2, 4, 6>/<1, 3, 5, 7>, then <0, 2>/<1, 3>, then <0>/<1>. Lane 0 is
; returned. %f1 is not used in the body. Costs are checked for the SSE3, AVX
; and AVX2 prefixes, all with the same expected value.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
330*9880d681SAndroid Build Coastguard Workerdefine fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
331*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
332*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
333*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
334*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
335*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
336*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
337*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
338*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
339*9880d681SAndroid Build Coastguard Worker  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
340*9880d681SAndroid Build Coastguard Worker
341*9880d681SAndroid Build Coastguard Worker; SSE3:  cost of 5 {{.*}} extractelement
342*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 5 {{.*}} extractelement
343*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 5 {{.*}} extractelement
344*9880d681SAndroid Build Coastguard Worker
345*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x i16> %bin.rdx9, i32 0
346*9880d681SAndroid Build Coastguard Worker  ret i16 %r
347*9880d681SAndroid Build Coastguard Worker}
348*9880d681SAndroid Build Coastguard Worker
; Pairwise integer add reduction of an <8 x i32> in three levels. Each level
; adds an even-lane shuffle to an odd-lane shuffle of the previous result:
; masks <0, 2, 4, 6>/<1, 3, 5, 7>, then <0, 2>/<1, 3>, then <0>/<1>. Lane 0 is
; returned. %f1 is not used in the body. Only the AVX and AVX2 prefixes have
; expectations here; there is no SSE3 check for this function.
; NOTE(review): no -LABEL directive anchors these checks to this function; they
; are matched in file order.
349*9880d681SAndroid Build Coastguard Workerdefine fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
350*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
351*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
352*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
353*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
354*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
355*9880d681SAndroid Build Coastguard Worker  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
356*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
357*9880d681SAndroid Build Coastguard Worker  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
358*9880d681SAndroid Build Coastguard Worker  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
359*9880d681SAndroid Build Coastguard Worker
360*9880d681SAndroid Build Coastguard Worker; AVX:  cost of 5 {{.*}} extractelement
361*9880d681SAndroid Build Coastguard Worker; AVX2:  cost of 5 {{.*}} extractelement
362*9880d681SAndroid Build Coastguard Worker
363*9880d681SAndroid Build Coastguard Worker  %r = extractelement <8 x i32> %bin.rdx9, i32 0
364*9880d681SAndroid Build Coastguard Worker  ret i32 %r
365*9880d681SAndroid Build Coastguard Worker}
366