; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
  ret <8 x i32> %res
}

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
; not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; We don't check any vextractf128 variant with immediate 0 because that's just a move.
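; As with vinsertf128, only the low bit of the vextractf128 immediate is
; meaningful here: a 256-bit vector has just two 128-bit lanes, so the
; immediate is effectively masked with (imm & 1), e.g. i8 2 selects lane 0
; exactly like i8 0.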

define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vextractf128 $0 which should be optimized away, so just check that it's
; not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
  ret <2 x double> %res
}


define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone


define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone


define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone

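; The legacy psll.dq/psrl.dq intrinsics take their shift count in bits rather
; than bytes, so the i32 8 operands below are one-byte shifts; the upgrade to
; a byte shuffle is why the expected vpslldq/vpsrldq masks move the data by a
; single byte.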
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone


define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone


define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone


define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

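; In the blend tests above, bit i of the immediate selects element i from the
; second source operand: i8 7 (0b111) takes elements 0-2 from %a1 (in xmm1)
; and the rest from %a0, while i8 2 (0b10) takes only element 1 from %a1.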

define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone


define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone

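; The zero-extension tests below get shuffle decode comments (elt,zero,...)
; because zero extension is equivalent to interleaving with zero; there is no
; comparable shuffle decomposition for the sign extensions above.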

define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone


define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxdq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone


define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone


define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone


define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtps2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone


define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

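; Several of the unaligned-store tests below add an arithmetic op before the
; store to pin the vector to a known execution domain: an integer add keeps it
; in the integer domain (vmovdqu), while an fadd keeps it in the floating-point
; domain (vmovupd).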
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
  ; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
; CHECK-NEXT:    retl
  %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
  ret void
}
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind


define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
  ; fadd operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovupd %xmm0, (%eax)
; CHECK-NEXT:    retl
  %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind


define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse_storeu_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovups %xmm0, (%eax)
; CHECK-NEXT:    retl
  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
  ret void
}
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind


define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
  ; FIXME: Unfortunately the execution domain fix pass changes this to vmovups, and it's hard to force the integer domain with no 256-bit integer instructions.
  ; add operation forces the execution domain.
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovups %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
  ret void
}
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind


define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
  ; fadd operation forces the execution domain.
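  ; (The fadd of +0.0 is not folded away because, without fast-math, adding
  ; +0.0 is not an identity: -0.0 + +0.0 yields +0.0. That is why the
  ; vxorpd/vaddpd pair survives below.)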
; CHECK-LABEL: test_x86_avx_storeu_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovupd %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind


define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_storeu_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovups %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
  ret void
}
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind


define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone


define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone


define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
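
; In the vpermil tests above, vpermilps decodes its immediate as four 2-bit
; selectors, so i8 7 (0b00000111) yields 3,1,0,0, repeated per 128-bit lane for
; the ymm form; vpermilpd uses one selector bit per element, so i8 7 (0b0111)
; yields 1,1,3,2 and i8 1 yields 1,0.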