// RUN: %clang_cc1 %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
// FIXME: This is testing optimized generation of shuffle instructions and should be fixed.

// Don't include mm_malloc.h, it's system specific.
// Pre-defining its include guard makes the header's contents be skipped when
// <immintrin.h> pulls it in.
#define __MM_MALLOC_H

#include <immintrin.h>

//
// Test LLVM IR codegen of shuffle instructions
//

// Verifies the shufflevector mask generated for _mm256_shuffle_ps with the
// immediate 203 (0xCB): the 2-bit selectors are 3, 2, 0, 3 from low to high and
// the expected mask repeats them in both 128-bit lanes, with indices >= 8
// referring to the second shuffle operand.
__m256 x(__m256 a, __m256 b) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file; the trailing '(' keeps the one-letter name from matching elsewhere.
  // CHECK-LABEL: @x(
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
  return _mm256_shuffle_ps(a, b, 203);
}

// Verifies the shufflevector mask generated for _mm_permute_pd with the
// immediate 1 (bit 0 set, bit 1 clear): the two doubles are expected swapped.
__m128d test_mm_permute_pd(__m128d a) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file, so the short <1, 0> pattern cannot be satisfied by another function.
  // CHECK-LABEL: @test_mm_permute_pd
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 1, i32 0>
  return _mm_permute_pd(a, 1);
}

// Verifies the shufflevector mask generated for _mm256_permute_pd with the
// immediate 5 (0b0101): each pair of doubles is expected swapped.
__m256d test_mm256_permute_pd(__m256d a) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm256_permute_pd
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
  return _mm256_permute_pd(a, 5);
}

// Verifies the shufflevector mask generated for _mm_permute_ps with the
// immediate 0x1b (0b00011011): 2-bit selectors 3, 2, 1, 0 from low to high,
// i.e. the four floats are expected reversed.
__m128 test_mm_permute_ps(__m128 a) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file; the trailing '(' keeps the label from also matching
  // test_mm_permute_ps2.
  // CHECK-LABEL: @test_mm_permute_ps(
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0>
  return _mm_permute_ps(a, 0x1b);
}

// Test case for PR12401
// Uses the immediate 0xe6 (0b11100110): 2-bit selectors 2, 1, 2, 3 from low to
// high, which is what the expected mask spells out.
__m128 test_mm_permute_ps2(__m128 a) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm_permute_ps2
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 2, i32 1, i32 2, i32 3>
  return _mm_permute_ps(a, 0xe6);
}

// Verifies the shufflevector mask generated for _mm256_permute_ps with the
// immediate 0x1b (2-bit selectors 3, 2, 1, 0): the expected mask reverses the
// floats within each 128-bit lane.
__m256 test_mm256_permute_ps(__m256 a) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm256_permute_ps
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  return _mm256_permute_ps(a, 0x1b);
}

// Verifies the shufflevector mask generated for _mm256_permute2f128_pd with
// the immediate 0x31: the expected mask picks elements 2-3 of the first
// shuffle operand and elements 2-3 of the second (indices 6-7).
__m256d test_mm256_permute2f128_pd(__m256d a, __m256d b) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm256_permute2f128_pd
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7>
  return _mm256_permute2f128_pd(a, b, 0x31);
}

// Verifies the shufflevector mask generated for _mm256_permute2f128_ps with
// the immediate 0x13.
// NOTE(review): the pattern does not pin the order of the shufflevector
// operands, so the mask indices are relative to whatever operand order the
// compiler emits - confirm against the generated IR before changing the mask.
__m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm256_permute2f128_ps
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  return _mm256_permute2f128_ps(a, b, 0x13);
}

// Verifies the shufflevector mask generated for _mm256_permute2f128_si256 with
// the immediate 0x20. The pattern expects the mask to be typed <8 x i32> and to
// take elements 0-3 of each shuffle operand.
__m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
  // Anchor the mask check to this function's IR, like the later tests in this
  // file.
  // CHECK-LABEL: @test_mm256_permute2f128_si256
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}} <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_permute2f128_si256(a, b, 0x20);
}

// Verifies that _mm_broadcast_ss produces four insertelement instructions on
// <4 x float>, one for each lane index 0 through 3, in that order.
__m128
test_mm_broadcast_ss(float const *__a) {
  // CHECK-LABEL: @test_mm_broadcast_ss
  // CHECK: insertelement <4 x float> {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, i32 3
  return _mm_broadcast_ss(__a);
}

// Verifies that _mm256_broadcast_sd produces four insertelement instructions
// on <4 x double>, one for each lane index 0 through 3, in that order.
__m256d
test_mm256_broadcast_sd(double const *__a) {
  // CHECK-LABEL: @test_mm256_broadcast_sd
  // CHECK: insertelement <4 x double> {{.*}}, i32 0
  // CHECK: insertelement <4 x double> {{.*}}, i32 1
  // CHECK: insertelement <4 x double> {{.*}}, i32 2
  // CHECK: insertelement <4 x double> {{.*}}, i32 3
  return _mm256_broadcast_sd(__a);
}

// Verifies that _mm256_broadcast_ss produces eight insertelement instructions
// on <8 x float>, one for each lane index 0 through 7, in that order.
__m256
test_mm256_broadcast_ss(float const *__a) {
  // CHECK-LABEL: @test_mm256_broadcast_ss
  // CHECK: insertelement <8 x float> {{.*}}, i32 0
  // CHECK: insertelement <8 x float> {{.*}}, i32 1
  // CHECK: insertelement <8 x float> {{.*}}, i32 2
  // CHECK: insertelement <8 x float> {{.*}}, i32 3
  // CHECK: insertelement <8 x float> {{.*}}, i32 4
  // CHECK: insertelement <8 x float> {{.*}}, i32 5
  // CHECK: insertelement <8 x float> {{.*}}, i32 6
  // CHECK: insertelement <8 x float> {{.*}}, i32 7
  return _mm256_broadcast_ss(__a);
}

// Make sure we have the correct mask for each insertf128 case.

// Inserts b at position 0 of a. The expected mask takes its low four elements
// from the second shuffle operand (indices 8-11) and keeps elements 4-7 of the
// first.
__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_0
  // CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  return _mm256_insertf128_ps(a, b, 0);
}

// Inserts b at position 0 of a. The expected mask takes its low two elements
// from the second shuffle operand (indices 4-5) and keeps elements 2-3 of the
// first.
__m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
  // CHECK-LABEL: @test_mm256_insertf128_pd_0
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
  return _mm256_insertf128_pd(a, b, 0);
}

// Inserts b at position 0 of a. The expected mask has four indices: the low
// two come from the second shuffle operand (indices 4-5) and elements 2-3 of
// the first are kept.
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
  // CHECK-LABEL: @test_mm256_insertf128_si256_0
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
  return _mm256_insertf128_si256(a, b, 0);
}

// Inserts b at position 1 of a. The expected mask keeps elements 0-3 of the
// first shuffle operand and takes its high four elements from the second
// (indices 8-11).
__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_insertf128_ps(a, b, 1);
}

// Inserts b at position 1 of a. The expected mask keeps elements 0-1 of the
// first shuffle operand and takes its high two elements from the second
// (indices 4-5).
__m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
  // CHECK-LABEL: @test_mm256_insertf128_pd_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
  return _mm256_insertf128_pd(a, b, 1);
}

// Inserts b at position 1 of a. The expected mask has four indices: elements
// 0-1 of the first shuffle operand are kept and the high two come from the
// second (indices 4-5).
__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
  // CHECK-LABEL: @test_mm256_insertf128_si256_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
  return _mm256_insertf128_si256(a, b, 1);
}

// Make sure we have the correct mask for each extractf128 case.

// Extracts position 0 of a; the expected mask selects elements 0-3.
__m128 test_mm256_extractf128_ps_0(__m256 a) {
  // CHECK-LABEL: @test_mm256_extractf128_ps_0
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
  return _mm256_extractf128_ps(a, 0);
}

// Extracts position 0 of a; the expected mask selects elements 0-1.
__m128d test_mm256_extractf128_pd_0(__m256d a) {
  // CHECK-LABEL: @test_mm256_extractf128_pd_0
  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
  return _mm256_extractf128_pd(a, 0);
}

// Extracts position 0 of a; the expected mask has two indices, selecting
// elements 0-1.
__m128i test_mm256_extractf128_si256_0(__m256i a) {
  // CHECK-LABEL: @test_mm256_extractf128_si256_0
  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
  return _mm256_extractf128_si256(a, 0);
}

// Extracts position 1 of a; the expected mask selects elements 4-7.
__m128 test_mm256_extractf128_ps_1(__m256 a) {
  // CHECK-LABEL: @test_mm256_extractf128_ps_1
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
  return _mm256_extractf128_ps(a, 1);
}

// Extracts position 1 of a; the expected mask selects elements 2-3.
__m128d test_mm256_extractf128_pd_1(__m256d a) {
  // CHECK-LABEL: @test_mm256_extractf128_pd_1
  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
  return _mm256_extractf128_pd(a, 1);
}

// Extracts position 1 of a; the expected mask has two indices, selecting
// elements 2-3.
__m128i test_mm256_extractf128_si256_1(__m256i a) {
  // CHECK-LABEL: @test_mm256_extractf128_si256_1
  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
  return _mm256_extractf128_si256(a, 1);
}

// Builds a 256-bit vector from two 128-bit halves via _mm256_set_m128(hi, lo).
// The expected mask is the identity over eight elements, i.e. a plain
// concatenation of the two shuffle operands.
__m256 test_mm256_set_m128(__m128 hi, __m128 lo) {
  // CHECK-LABEL: @test_mm256_set_m128
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_set_m128(hi, lo);
}

// Builds a 256-bit vector from two 128-bit halves via _mm256_set_m128d(hi, lo).
// The expected mask is the identity over eight elements.
// NOTE(review): eight indices for a four-double result suggests the shuffle is
// done on an 8-element vector - presumably via the float variant; confirm
// against the intrinsic header.
__m256d test_mm256_set_m128d(__m128d hi, __m128d lo) {
  // CHECK-LABEL: @test_mm256_set_m128d
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_set_m128d(hi, lo);
}

// Builds a 256-bit vector from two 128-bit halves via _mm256_set_m128i(hi, lo).
// The expected mask is the identity over eight elements.
// NOTE(review): eight indices suggests the shuffle is done on an 8-element
// vector - presumably via the float variant; confirm against the intrinsic
// header.
__m256i test_mm256_set_m128i(__m128i hi, __m128i lo) {
  // CHECK-LABEL: @test_mm256_set_m128i
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_set_m128i(hi, lo);
}

// Builds a 256-bit vector via _mm256_setr_m128. Note the argument order: the
// call passes lo first and hi second, the reverse of this function's own
// parameter order. The expected mask is the identity over eight elements.
__m256 test_mm256_setr_m128(__m128 hi, __m128 lo) {
  // CHECK-LABEL: @test_mm256_setr_m128
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_setr_m128(lo, hi);
}

// Builds a 256-bit vector via _mm256_setr_m128d. Note the argument order: the
// call passes lo first and hi second, the reverse of this function's own
// parameter order. The expected mask is the identity over eight elements.
// NOTE(review): eight indices for a four-double result suggests the shuffle is
// done on an 8-element vector - confirm against the intrinsic header.
__m256d test_mm256_setr_m128d(__m128d hi, __m128d lo) {
  // CHECK-LABEL: @test_mm256_setr_m128d
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_setr_m128d(lo, hi);
}

// Builds a 256-bit vector via _mm256_setr_m128i. Note the argument order: the
// call passes lo first and hi second, the reverse of this function's own
// parameter order. The expected mask is the identity over eight elements.
// NOTE(review): eight indices suggests the shuffle is done on an 8-element
// vector - confirm against the intrinsic header.
__m256i test_mm256_setr_m128i(__m128i hi, __m128i lo) {
  // CHECK-LABEL: @test_mm256_setr_m128i
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm256_setr_m128i(lo, hi);
}
213