; Source: external/llvm/test/CodeGen/X86/avx512-intrinsics.ll
; (AOSP code-search xref, revision 9880d6810fe72a1726cb53787c6711e909410d58)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

; AVX-512 k-register (mask) intrinsic lowering tests: kortest{z,c}, kand, knot, kunpckbw.
; (Scraped line numbers that had been fused onto each line are removed so the
; file parses as LLVM IR again; FileCheck lines are otherwise untouched.)
declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
define i32 @test_kortestz(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    kortestw %k0, %k1
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
define i32 @test_kortestc(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestc:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kortestw %k0, %k1
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
  ret i32 %res
}

declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
define i16 @test_kand(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kand:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    kmovw %eax, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kandw %k0, %k1, %k0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
  %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
  ret i16 %t2
}

declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
define i16 @test_knot(i16 %a0) {
; CHECK-LABEL: test_knot:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k0
; CHECK-NEXT:    knotw %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone

define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kunpckbw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
  ret i16 %res
}

; Reciprocal-approximation (rcp14) and round-to-integral (rndscale) 512-bit tests.
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrcp14ps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrcp14pd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone

declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

define <8 x double> @test7(<8 x double> %a) {
; CHECK-LABEL: test7:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
  ret <8 x double>%res
}

declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

define <16 x float> @test8(<16 x float> %a) {
; CHECK-LABEL: test8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
  ret <16 x float>%res
}

; rsqrt14 and (masked-intrinsic, all-ones mask) sqrt tests, including one
; explicit rounding-mode operand (i32 3 selects the {rz-sae} encoding below).
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrsqrt14ps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone

define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_round_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone

; getexp tests with default rounding (i32 4) and suppress-all-exceptions (i32 8 -> {sae}).
define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexppd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_round_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexppd {sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 8)
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone

define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexpps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_round_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexpps {sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone

; Scalar masked sqrt: exercises merge-masking, zero-masking, and each
; embedded-rounding immediate (4/1/2/3) in a single function.
declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res   = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}

declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res   = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}

; Scalar FP <-> integer conversions: legacy SSE/SSE2 intrinsics plus AVX-512
; truncating variants with default (i32 4) and {sae} (i32 8) rounding operands.
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
; CHECK-NEXT:    retq
  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
  ret i64 %res
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone

define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2si %xmm0, %rcx
; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone



define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2usi %xmm0, %rcx
; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone

define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse_cvtss2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2si %xmm0, %rax
; CHECK-NEXT:    retq
  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
  ret i64 %res
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone


; Single-precision scalar conversions: SSE cvtsi642ss and AVX-512 truncating
; cvttss2{si,usi}{,64} with default (i32 4) and {sae} (i32 8) operands.
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse_cvtsi642ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone


define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2si %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone

define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2si %xmm0, %rcx
; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone

define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2usi %xmm0, %rcx
; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone

; Rounding (non-truncating) scalar-to-64-bit conversions; rounding operands
; 4/3/1 select the current, {rz-sae}, and {rd-sae} encodings respectively.
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2usi %xmm0, %rcx
; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rax
; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %rdx
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    addq %rdx, %rax
; CHECK-NEXT:    retq

  %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
  %res1 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 3)
  %res2 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 1)
  %res3 = add i64 %res, %res1
  %res4 = add i64 %res3, %res2
  ret i64 %res4
}
declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone

define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2si %xmm0, %rcx
; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rax
; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %rdx
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    addq %rdx, %rax
; CHECK-NEXT:    retq

  %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
  %res1 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 3)
  %res2 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 1)
  %res3 = add i64 %res, %res1
  %res4 = add i64 %res3, %res2
  ret i64 %res4
}
declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone

define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2usi %xmm0, %rcx
; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rax
; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %rdx
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    addq %rdx, %rax
; CHECK-NEXT:    retq

  %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
  %res1 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 3)
  %res2 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 1)
  %res3 = add i64 %res, %res1
  %res4 = add i64 %res3, %res2
  ret i64 %res4
}
declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone

define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2si %xmm0, %rcx
; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rax
; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %rdx
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    addq %rdx, %rax
; CHECK-NEXT:    retq

  %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
  %res1 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 3)
  %res2 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 1)
  %res3 = add i64 %res, %res1
  %res4 = add i64 %res3, %res2
  ret i64 %res4
}
declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone

; 32-bit counterparts of the rounding scalar conversions above.
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2usi %xmm0, %ecx
; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %eax
; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %edx
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    addl %edx, %eax
; CHECK-NEXT:    retq

  %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 3)
  %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 1)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2si %xmm0, %ecx
; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %eax
; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %edx
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    addl %edx, %eax
; CHECK-NEXT:    retq

  %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 3)
  %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 1)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2usi %xmm0, %ecx
; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %eax
; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %edx
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    addl %edx, %eax
; CHECK-NEXT:    retq

  %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 3)
  %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 1)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2si %xmm0, %ecx
; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %eax
; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %edx
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    addl %edx, %eax
; CHECK-NEXT:    retq

  %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 3)
  %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 1)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone

; Half-precision conversions (vcvtph2ps/vcvtps2ph) across unmasked, merge-mask,
; and zero-mask forms, plus load-folded broadcast intrinsics.
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly

define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
  %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
  %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
  store <16 x i16> %res1, <16 x i16> * %dst
  %res  = add <16 x i16> %res2, %res3
  ret <16 x i16> %res
}

declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_ss_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly

define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_sd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly

; AVX-512CD intrinsics: vpconflict{d,q} and vplzcnt{d,q}, unmasked and masked.
define <16 x i32> @test_conflict_d(<16 x i32> %a) {
; CHECK-LABEL: test_conflict_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpconflictd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

define <8 x i64> @test_conflict_q(<8 x i64> %a) {
; CHECK-LABEL: test_conflict_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpconflictq %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly

define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
; CHECK-LABEL: test_maskz_conflict_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpconflictd %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_conflict_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret <8 x i64> %res
}

define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
; CHECK-LABEL: test_lzcnt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
; CHECK-LABEL: test_lzcnt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly


define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_lzcnt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret <16 x i32> %res
}

define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret <8 x i64> %res
}

; FP compare with SAE (last operand i32 8): predicate 2 (LE) and an all-ones
; mask select the plain "vcmpleps {sae}" form; the i16 mask result is read
; back from %k0 via kmovw.
733 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
734; CHECK-LABEL: test_cmpps:
735; CHECK:       ## BB#0:
736; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
737; CHECK-NEXT:    kmovw %k0, %eax
738; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
739; CHECK-NEXT:    retq
740   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
741   ret i16 %res
742 }
743 declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
744
; Double-precision variant, default rounding (last operand i32 4): predicate
; 4 (NEQ) selects vcmpneqpd; the i8 mask result is read back via kmovw.
745 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
746; CHECK-LABEL: test_cmppd:
747; CHECK:       ## BB#0:
748; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
749; CHECK-NEXT:    kmovw %k0, %eax
750; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
751; CHECK-NEXT:    retq
752   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
753   ret i8 %res
754 }
755 declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
756
757 ; fp min - max
; Unmasked max (all-ones i8 mask, zero passthru, default rounding i32 4)
; should lower to the bare vmaxpd instruction with no mask register setup.
758define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
759; CHECK-LABEL: test_vmaxpd:
760; CHECK:       ## BB#0:
761; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
762; CHECK-NEXT:    retq
763  %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
764                    <8 x double>zeroinitializer, i8 -1, i32 4)
765  ret <8 x double> %res
766}
767declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
768                    <8 x double>, i8, i32)
769
; Same unmasked pattern for min: expect a single vminpd.
770define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
771; CHECK-LABEL: test_vminpd:
772; CHECK:       ## BB#0:
773; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
774; CHECK-NEXT:    retq
775  %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
776                    <8 x double>zeroinitializer, i8 -1, i32 4)
777  ret <8 x double> %res
778}
779declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
780                    <8 x double>, i8, i32)
781
782 declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
783
; Calls the pabs intrinsic twice — once with the variable mask (merge into
; %zmm1) and once with an all-ones mask (unmasked vpabsd) — and adds the
; results so both lowered instructions stay live in the output.
784define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
785; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
786; CHECK:       ## BB#0:
787; CHECK-NEXT:    kmovw %edi, %k1
788; CHECK-NEXT:    vpabsd %zmm0, %zmm1 {%k1}
789; CHECK-NEXT:    vpabsd %zmm0, %zmm0
790; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
791; CHECK-NEXT:    retq
792  %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
793  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
794  %res2 = add <16 x i32> %res, %res1
795  ret <16 x i32> %res2
796}
797
798declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
799
; Same masked+unmasked pairing for the 64-bit-element variant (vpabsq).
800define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
801; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
802; CHECK:       ## BB#0:
803; CHECK-NEXT:    kmovw %edi, %k1
804; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
805; CHECK-NEXT:    vpabsq %zmm0, %zmm0
806; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
807; CHECK-NEXT:    retq
808  %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
809  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
810  %res2 = add <8 x i64> %res, %res1
811  ret <8 x i64> %res2
812}
813
; vptestmq with an all-ones mask (unmasked) and with %m (zero-masked via
; {%k1}); both i8 results are added so neither compare can be eliminated.
814define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
815; CHECK-LABEL: test_vptestmq:
816; CHECK:       ## BB#0:
817; CHECK-NEXT:    kmovw %edi, %k1
818; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0 {%k1}
819; CHECK-NEXT:    kmovw %k0, %ecx
820; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
821; CHECK-NEXT:    kmovw %k0, %eax
822; CHECK-NEXT:    addb %cl, %al
823; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
824; CHECK-NEXT:    retq
825  %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
826  %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
827  %res2 = add i8 %res1, %res
828  ret i8 %res2
829}
830declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
831
; 32-bit-element variant: vptestmd, i16 mask results combined with addl.
832define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
833; CHECK-LABEL: test_vptestmd:
834; CHECK:       ## BB#0:
835; CHECK-NEXT:    kmovw %edi, %k1
836; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0 {%k1}
837; CHECK-NEXT:    kmovw %k0, %ecx
838; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
839; CHECK-NEXT:    kmovw %k0, %eax
840; CHECK-NEXT:    addl %ecx, %eax
841; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
842; CHECK-NEXT:    retq
843  %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
844  %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
845  %res2 = add i16 %res1, %res
846  ret i16 %res2
847}
848declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
849
; valignq with all-ones mask: lowers to the bare instruction with immediate 2.
850define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
851; CHECK-LABEL: test_valign_q:
852; CHECK:       ## BB#0:
853; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm0
854; CHECK-NEXT:    retq
855  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
856  ret <8 x i64> %res
857}
858
; Merge-masked valignq: result merges into %src (%zmm2), then copied to %zmm0.
859define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
860; CHECK-LABEL: test_mask_valign_q:
861; CHECK:       ## BB#0:
862; CHECK-NEXT:    kmovw %edi, %k1
863; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
864; CHECK-NEXT:    vmovaps %zmm2, %zmm0
865; CHECK-NEXT:    retq
866  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
867  ret <8 x i64> %res
868}
869
870declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
871
; Zero-masked valignd: zeroinitializer passthru selects the {z} form.
872define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
873; CHECK-LABEL: test_maskz_valign_d:
874; CHECK:       ## BB#0:
875; CHECK-NEXT:    kmovw %edi, %k1
876; CHECK-NEXT:    valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z}
877; CHECK-NEXT:    retq
878  %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
879  ret <16 x i32> %res
880}
881
882declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
883
; Masked scalar store: the intrinsic lowers to a single masked vmovss to
; memory (note the mask arrives in ESI here — second integer argument).
884define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
885; CHECK-LABEL: test_mask_store_ss:
886; CHECK:       ## BB#0:
887; CHECK-NEXT:    kmovw %esi, %k1
888; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
889; CHECK-NEXT:    retq
890 call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
891 ret void
892}
893
894declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
895
; Exercises all eight signed-compare predicates (immediates 0-7) of
; mask.cmp.d.512 with an all-ones mask; each i16 mask result is inserted
; into one lane of an <8 x i16>, so the output must contain one vpcmp* per
; predicate (eq/lt/le/unord/neq/nlt/nle/ord) plus the kmovw/vpinsrw packing.
896define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
897; CHECK-LABEL: test_cmp_d_512:
898; CHECK:       ## BB#0:
899; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
900; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k1
901; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k2
902; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k3
903; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4
904; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5
905; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k6
906; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k7
907; CHECK-NEXT:    kmovw %k1, %eax
908; CHECK-NEXT:    kmovw %k0, %ecx
909; CHECK-NEXT:    vmovd %ecx, %xmm0
910; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
911; CHECK-NEXT:    kmovw %k2, %eax
912; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
913; CHECK-NEXT:    kmovw %k3, %eax
914; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
915; CHECK-NEXT:    kmovw %k4, %eax
916; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
917; CHECK-NEXT:    kmovw %k5, %eax
918; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
919; CHECK-NEXT:    kmovw %k6, %eax
920; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
921; CHECK-NEXT:    kmovw %k7, %eax
922; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
923; CHECK-NEXT:    retq
924  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
925  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
926  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
927  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
928  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
929  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
930  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
931  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
932  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
933  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
934  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
935  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
936  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
937  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
938  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
939  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
940  ret <8 x i16> %vec7
941}
942
; Same eight predicates but with a variable %mask: every compare carries the
; {%k1} write-mask; register allocation shifts results into k0,k2..k7,k1.
943define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
944; CHECK-LABEL: test_mask_cmp_d_512:
945; CHECK:       ## BB#0:
946; CHECK-NEXT:    kmovw %edi, %k1
947; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
948; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k2 {%k1}
949; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1}
950; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k4 {%k1}
951; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k5 {%k1}
952; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k6 {%k1}
953; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k7 {%k1}
954; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k1 {%k1}
955; CHECK-NEXT:    kmovw %k2, %eax
956; CHECK-NEXT:    kmovw %k0, %ecx
957; CHECK-NEXT:    vmovd %ecx, %xmm0
958; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
959; CHECK-NEXT:    kmovw %k3, %eax
960; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
961; CHECK-NEXT:    kmovw %k4, %eax
962; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
963; CHECK-NEXT:    kmovw %k5, %eax
964; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
965; CHECK-NEXT:    kmovw %k6, %eax
966; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
967; CHECK-NEXT:    kmovw %k7, %eax
968; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
969; CHECK-NEXT:    kmovw %k1, %eax
970; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
971; CHECK-NEXT:    retq
972  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
973  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
974  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
975  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
976  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
977  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
978  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
979  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
980  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
981  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
982  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
983  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
984  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
985  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
986  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
987  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
988  ret <8 x i16> %vec7
989}
990
991declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
992
; Unsigned counterpart of test_cmp_d_512: all eight ucmp predicates (0-7)
; with an all-ones mask lower to the "u"-suffixed vpcmp*ud forms; results
; are packed one per <8 x i16> lane via kmovw/vpinsrw.
993define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
994; CHECK-LABEL: test_ucmp_d_512:
995; CHECK:       ## BB#0:
996; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0
997; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
998; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k2
999; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k3
1000; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k4
1001; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5
1002; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k6
1003; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k7
1004; CHECK-NEXT:    kmovw %k1, %eax
1005; CHECK-NEXT:    kmovw %k0, %ecx
1006; CHECK-NEXT:    vmovd %ecx, %xmm0
1007; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1008; CHECK-NEXT:    kmovw %k2, %eax
1009; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
1010; CHECK-NEXT:    kmovw %k3, %eax
1011; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
1012; CHECK-NEXT:    kmovw %k4, %eax
1013; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
1014; CHECK-NEXT:    kmovw %k5, %eax
1015; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
1016; CHECK-NEXT:    kmovw %k6, %eax
1017; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
1018; CHECK-NEXT:    kmovw %k7, %eax
1019; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1020; CHECK-NEXT:    retq
1021  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
1022  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1023  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
1024  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1025  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
1026  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1027  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
1028  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1029  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
1030  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1031  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
1032  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1033  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
1034  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1035  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
1036  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1037  ret <8 x i16> %vec7
1038}
1039
; Masked variant: each unsigned compare carries the {%k1} write-mask.
1040define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1041; CHECK-LABEL: test_mask_ucmp_d_512:
1042; CHECK:       ## BB#0:
1043; CHECK-NEXT:    kmovw %edi, %k1
1044; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0 {%k1}
1045; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1}
1046; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1}
1047; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k4 {%k1}
1048; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k5 {%k1}
1049; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k6 {%k1}
1050; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k7 {%k1}
1051; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k1 {%k1}
1052; CHECK-NEXT:    kmovw %k2, %eax
1053; CHECK-NEXT:    kmovw %k0, %ecx
1054; CHECK-NEXT:    vmovd %ecx, %xmm0
1055; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1056; CHECK-NEXT:    kmovw %k3, %eax
1057; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
1058; CHECK-NEXT:    kmovw %k4, %eax
1059; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
1060; CHECK-NEXT:    kmovw %k5, %eax
1061; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
1062; CHECK-NEXT:    kmovw %k6, %eax
1063; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
1064; CHECK-NEXT:    kmovw %k7, %eax
1065; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
1066; CHECK-NEXT:    kmovw %k1, %eax
1067; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1068; CHECK-NEXT:    retq
1069  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
1070  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1071  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
1072  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1073  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
1074  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1075  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
1076  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1077  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
1078  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1079  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
1080  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1081  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
1082  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1083  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
1084  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1085  ret <8 x i16> %vec7
1086}
1087
1088declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
1089
; 64-bit-element signed compares: all eight predicates of mask.cmp.q.512
; with an all-ones i8 mask; each i8 result is inserted into an <8 x i8>
; (note vpinsrb at even byte offsets — <8 x i8> is widened to v8i16 lanes).
1090define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
1091; CHECK-LABEL: test_cmp_q_512:
1092; CHECK:       ## BB#0:
1093; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
1094; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k1
1095; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k2
1096; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k3
1097; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4
1098; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k5
1099; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k6
1100; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k7
1101; CHECK-NEXT:    kmovw %k0, %eax
1102; CHECK-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
1103; CHECK-NEXT:    kmovw %k1, %eax
1104; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1105; CHECK-NEXT:    kmovw %k2, %eax
1106; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1107; CHECK-NEXT:    kmovw %k3, %eax
1108; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1109; CHECK-NEXT:    kmovw %k4, %eax
1110; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1111; CHECK-NEXT:    kmovw %k5, %eax
1112; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1113; CHECK-NEXT:    kmovw %k6, %eax
1114; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1115; CHECK-NEXT:    kmovw %k7, %eax
1116; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1117; CHECK-NEXT:    retq
1118  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
1119  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1120  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
1121  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1122  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
1123  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1124  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
1125  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1126  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
1127  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1128  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
1129  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1130  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
1131  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1132  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
1133  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1134  ret <8 x i8> %vec7
1135}
1136
; Masked variant: every compare carries the {%k1} write-mask.
1137define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1138; CHECK-LABEL: test_mask_cmp_q_512:
1139; CHECK:       ## BB#0:
1140; CHECK-NEXT:    kmovw %edi, %k1
1141; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
1142; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k2 {%k1}
1143; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k3 {%k1}
1144; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k4 {%k1}
1145; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k5 {%k1}
1146; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k6 {%k1}
1147; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k7 {%k1}
1148; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k1 {%k1}
1149; CHECK-NEXT:    kmovw %k0, %eax
1150; CHECK-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
1151; CHECK-NEXT:    kmovw %k2, %eax
1152; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1153; CHECK-NEXT:    kmovw %k3, %eax
1154; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1155; CHECK-NEXT:    kmovw %k4, %eax
1156; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1157; CHECK-NEXT:    kmovw %k5, %eax
1158; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1159; CHECK-NEXT:    kmovw %k6, %eax
1160; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1161; CHECK-NEXT:    kmovw %k7, %eax
1162; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1163; CHECK-NEXT:    kmovw %k1, %eax
1164; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1165; CHECK-NEXT:    retq
1166  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
1167  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1168  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
1169  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1170  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
1171  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1172  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
1173  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1174  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
1175  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1176  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
1177  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1178  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
1179  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1180  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
1181  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1182  ret <8 x i8> %vec7
1183}
1184
1185declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1186
; Unsigned 64-bit-element compares: all eight ucmp.q predicates with an
; all-ones mask lower to the "u"-suffixed vpcmp*uq forms, packed into an
; <8 x i8> via kmovw/vpinsrb at even byte offsets.
1187define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
1188; CHECK-LABEL: test_ucmp_q_512:
1189; CHECK:       ## BB#0:
1190; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0
1191; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1
1192; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k2
1193; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k3
1194; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k4
1195; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k5
1196; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k6
1197; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k7
1198; CHECK-NEXT:    kmovw %k0, %eax
1199; CHECK-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
1200; CHECK-NEXT:    kmovw %k1, %eax
1201; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1202; CHECK-NEXT:    kmovw %k2, %eax
1203; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1204; CHECK-NEXT:    kmovw %k3, %eax
1205; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1206; CHECK-NEXT:    kmovw %k4, %eax
1207; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1208; CHECK-NEXT:    kmovw %k5, %eax
1209; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1210; CHECK-NEXT:    kmovw %k6, %eax
1211; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1212; CHECK-NEXT:    kmovw %k7, %eax
1213; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1214; CHECK-NEXT:    retq
1215  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
1216  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1217  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
1218  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1219  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
1220  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1221  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
1222  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1223  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
1224  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1225  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
1226  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1227  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
1228  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1229  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
1230  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1231  ret <8 x i8> %vec7
1232}
1233
; Masked variant: every unsigned compare carries the {%k1} write-mask.
1234define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1235; CHECK-LABEL: test_mask_ucmp_q_512:
1236; CHECK:       ## BB#0:
1237; CHECK-NEXT:    kmovw %edi, %k1
1238; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
1239; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
1240; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k3 {%k1}
1241; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k4 {%k1}
1242; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k5 {%k1}
1243; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k6 {%k1}
1244; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k7 {%k1}
1245; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k1 {%k1}
1246; CHECK-NEXT:    kmovw %k0, %eax
1247; CHECK-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
1248; CHECK-NEXT:    kmovw %k2, %eax
1249; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1250; CHECK-NEXT:    kmovw %k3, %eax
1251; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1252; CHECK-NEXT:    kmovw %k4, %eax
1253; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1254; CHECK-NEXT:    kmovw %k5, %eax
1255; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1256; CHECK-NEXT:    kmovw %k6, %eax
1257; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1258; CHECK-NEXT:    kmovw %k7, %eax
1259; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1260; CHECK-NEXT:    kmovw %k1, %eax
1261; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1262; CHECK-NEXT:    retq
1263  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
1264  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1265  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
1266  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1267  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
1268  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1269  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
1270  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1271  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
1272  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1273  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
1274  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1275  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
1276  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1277  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
1278  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1279  ret <8 x i8> %vec7
1280}
1281
1282declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1283
; Merge-masked 128-bit extract: result merges into %b (%xmm0).
1284define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
1285; CHECK-LABEL: test_mask_vextractf32x4:
1286; CHECK:       ## BB#0:
1287; CHECK-NEXT:    kmovw %edi, %k1
1288; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1}
1289; CHECK-NEXT:    retq
1290  %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
1291  ret <4 x float> %res
1292}
1293
1294declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
1295
; Merge-masked 256-bit integer extract into %b (%ymm0).
1296define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
1297; CHECK-LABEL: test_mask_vextracti64x4:
1298; CHECK:       ## BB#0:
1299; CHECK-NEXT:    kmovw %edi, %k1
1300; CHECK-NEXT:    vextracti64x4 $2, %zmm1, %ymm0 {%k1}
1301; CHECK-NEXT:    retq
1302  %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
1303  ret <4 x i64> %res
1304}
1305
1306declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
1307
; Zero-masked extract: zeroinitializer passthru selects the {z} form.
1308define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
1309; CHECK-LABEL: test_maskz_vextracti32x4:
1310; CHECK:       ## BB#0:
1311; CHECK-NEXT:    kmovw %edi, %k1
1312; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
1313; CHECK-NEXT:    retq
1314  %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
1315  ret <4 x i32> %res
1316}
1317
1318declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
1319
; Unmasked extract (all-ones mask) folds to the plain instruction.
1320define <4 x double> @test_vextractf64x4(<8 x double> %a) {
1321; CHECK-LABEL: test_vextractf64x4:
1322; CHECK:       ## BB#0:
1323; CHECK-NEXT:    vextractf64x4 $2, %zmm0, %ymm0
1324; CHECK-NEXT:    retq
1325  %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
1326  ret <4 x double> %res
1327}
1328
1329declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
1330
; vpslld, unmasked / merge-masked / zero-masked triple for the same intrinsic.
; Unmasked (all-ones mask, zero passthru) folds to the plain instruction.
1331define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
1332; CHECK-LABEL: test_x86_avx512_psll_d:
1333; CHECK:       ## BB#0:
1334; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0
1335; CHECK-NEXT:    retq
1336  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
1337  ret <16 x i32> %res
1338}
1339
; Merge-masked: result merges into %a2 (%zmm2), then copied to %zmm0.
1340define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
1341; CHECK-LABEL: test_x86_avx512_mask_psll_d:
1342; CHECK:       ## BB#0:
1343; CHECK-NEXT:    kmovw %edi, %k1
1344; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1}
1345; CHECK-NEXT:    vmovaps %zmm2, %zmm0
1346; CHECK-NEXT:    retq
1347  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
1348  ret <16 x i32> %res
1349}
1350
; Zero-masked: zeroinitializer passthru + variable mask selects the {z} form.
1351define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
1352; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
1353; CHECK:       ## BB#0:
1354; CHECK-NEXT:    kmovw %edi, %k1
1355; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
1356; CHECK-NEXT:    retq
1357  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
1358  ret <16 x i32> %res
1359}
1360
1361declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1362
; vpsllq triple (unmasked / merge-masked / zero-masked), mirroring psll.d.
1363define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
1364; CHECK-LABEL: test_x86_avx512_psll_q:
1365; CHECK:       ## BB#0:
1366; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
1367; CHECK-NEXT:    retq
1368  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
1369  ret <8 x i64> %res
1370}
1371
; Merge-masked: result merges into %a2 (%zmm2), then copied to %zmm0.
1372define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
1373; CHECK-LABEL: test_x86_avx512_mask_psll_q:
1374; CHECK:       ## BB#0:
1375; CHECK-NEXT:    kmovw %edi, %k1
1376; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
1377; CHECK-NEXT:    vmovaps %zmm2, %zmm0
1378; CHECK-NEXT:    retq
1379  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
1380  ret <8 x i64> %res
1381}
1382
; Zero-masked {z} form.
1383define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1384; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
1385; CHECK:       ## BB#0:
1386; CHECK-NEXT:    kmovw %edi, %k1
1387; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
1388; CHECK-NEXT:    retq
1389  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
1390  ret <8 x i64> %res
1391}
1392
1393declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1394
; Logical right shift of 16 x i32 by the low dword of an XMM count
; (llvm.x86.avx512.mask.psrl.d): unmasked, merge-masked, and zero-masked.
define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1426
; Logical right shift of 8 x i64 by the low qword of an XMM count
; (llvm.x86.avx512.mask.psrl.q): unmasked, merge-masked, and zero-masked.
define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1458
; Arithmetic right shift of 16 x i32 by the low dword of an XMM count
; (llvm.x86.avx512.mask.psra.d): unmasked, merge-masked, and zero-masked.
define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1490
; Arithmetic right shift of 8 x i64 by the low qword of an XMM count
; (llvm.x86.avx512.mask.psra.q, vpsraq): unmasked, merge-masked, zero-masked.
define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1522
; Per-element (variable) logical left shift of 16 x i32
; (llvm.x86.avx512.mask.psllv.d, vpsllvd): unmasked, merge-masked, zero-masked.
define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
1554
; Per-element (variable) logical left shift of 8 x i64
; (llvm.x86.avx512.mask.psllv.q, vpsllvq): unmasked, merge-masked, zero-masked.
define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
1586
1587
; Per-element (variable) arithmetic right shift of 16 x i32
; (llvm.x86.avx512.mask.psrav.d, vpsravd): unmasked, merge-masked, zero-masked.
define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
1619
; Per-element (variable) arithmetic right shift of 8 x i64
; (llvm.x86.avx512.mask.psrav.q, vpsravq): unmasked, merge-masked, zero-masked.
define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
1651
; Per-element (variable) logical right shift of 16 x i32
; (llvm.x86.avx512.mask.psrlv.d, vpsrlvd): unmasked, merge-masked, zero-masked.
define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
1683
; Per-element (variable) logical right shift of 8 x i64
; (llvm.x86.avx512.mask.psrlv.q, vpsrlvq): unmasked, merge-masked, zero-masked.
define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
1715
; Shift-count loaded from memory: the load should fold into vpsrlvq's
; memory operand instead of a separate vmovdqu + register-register shift.
define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
1725
; Masked FP arithmetic intrinsics with an explicit rounding-mode operand.
; The trailing i32 selects the embedded rounding used by the tests below:
; 0 = {rn-sae}, 1 = {rd-sae}, 2 = {ru-sae}, 3 = {rz-sae}.
declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
1729
; Unmasked vsubps with each of the four embedded rounding modes; each rounding
; immediate (0..3) must print as the matching {rn/rd/ru/rz-sae} suffix.
define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
1769
; Unmasked vmulps with each of the four embedded rounding modes (same
; encoding as the vsubps tests: i32 0..3 -> {rn/rd/ru/rz-sae}).
define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
1809
;; mask float
;; Zero-masked vmulps (zeroinitializer passthru + variable %mask) combined
;; with each embedded rounding mode; must select the {z} destination form.
define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}
1854
;; With Passthru value
;; Merge-masked vmulps: a real %passthru operand forces the merging form
;; ({%k1} without {z}), computing into the passthru register (zmm2).
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 3)
  ret <16 x float> %res
}
1903
;; mask double
;; Zero-masked vmulpd (8 x double, i8 mask) with each embedded rounding mode.
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 0)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 1)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 2)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 3)
  ret <8 x double> %res
}
1948
; Full operand-form matrix for masked vpaddd (llvm.x86.avx512.mask.padd.d.512).
; Suffix key: rr = reg/reg, rm = reg/mem (load folds into vpaddd),
; rmb = reg/mem-broadcast (scalar load with {1to16}); k = merge-masked into
; %passThru, kz = zero-masked, no suffix = unmasked (all-ones mask).
define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
2052
; Masked vpsubd (llvm.x86.avx512.mask.psub.d.512), reg/reg and reg/mem
; operand forms, mirroring the padd.d.512 matrix above (rr/rrk/rrkz/rm/rmk/rmkz).
define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
2115
2116define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
2117; CHECK-LABEL: test_mask_sub_epi32_rmb:
2118; CHECK:       ## BB#0:
2119; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0
2120; CHECK-NEXT:    retq
2121  %q = load i32, i32* %ptr_b
2122  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2123  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2124  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2125  ret < 16 x i32> %res
2126}
2127
2128define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
2129; CHECK-LABEL: test_mask_sub_epi32_rmbk:
2130; CHECK:       ## BB#0:
2131; CHECK-NEXT:    kmovw %esi, %k1
2132; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
2133; CHECK-NEXT:    vmovaps %zmm1, %zmm0
2134; CHECK-NEXT:    retq
2135  %q = load i32, i32* %ptr_b
2136  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2137  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2138  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2139  ret < 16 x i32> %res
2140}
2141
2142define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
2143; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
2144; CHECK:       ## BB#0:
2145; CHECK-NEXT:    kmovw %esi, %k1
2146; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
2147; CHECK-NEXT:    retq
2148  %q = load i32, i32* %ptr_b
2149  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2150  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2151  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2152  ret < 16 x i32> %res
2153}
2154
2155declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
2156
; Masked VPADDQ (padd.q.512) test family over <8 x i64> with an i8 mask.
; Suffix key: rr = reg/reg, rm = reg/mem (load folded), rmb =
; reg/broadcast-mem ({1to8} folded); trailing k = merge-masking into
; %passThru, kz = zero-masking, no suffix = unmasked (i8 -1).
define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_add_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
2260
; Masked VPSUBQ (psub.q.512) test family over <8 x i64> with an i8 mask.
; Suffix key: rr = reg/reg, rm = reg/mem (load folded), rmb =
; reg/broadcast-mem ({1to8} folded); trailing k = merge-masking into
; %passThru, kz = zero-masking, no suffix = unmasked (i8 -1).
define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_sub_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
2364
; Masked VPMULDQ (pmul.dq.512) test family: signed 32x32->64 multiply of
; the even dword lanes, <16 x i32> inputs, <8 x i64> result, i8 mask.
; Suffix key: rr = reg/reg, rm = reg/mem, rmb = reg/broadcast-mem (a
; 64-bit scalar is splatted then bitcast to <16 x i32> so it folds as a
; {1to8} qword broadcast); trailing k = merge-masked, kz = zero-masked.
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
2471
; Masked VPMULUDQ (pmulu.dq.512) test family: unsigned 32x32->64 multiply
; of the even dword lanes; mirrors the vpmuldq tests above.
; Suffix key: rr = reg/reg, rm = reg/mem, rmb = reg/broadcast-mem (64-bit
; scalar splatted, bitcast to <16 x i32>, folded as {1to8} broadcast);
; trailing k = merge-masked into %passThru, kz = zero-masked.
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
2578
; Masked VPMULLD (pmull.d.512) test family: low 32-bit lane-wise multiply
; over <16 x i32> with an i16 mask. Suffix key: rr = reg/reg, rm =
; reg/mem (load folded), rmb = reg/broadcast-mem ({1to16} folded);
; trailing k = merge-masked into %passThru, kz = zero-masked.
define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
2682
; VADDPS with embedded rounding (mask.add.ps.512): the trailing i32
; selects the rounding mode — 0 = {rn-sae}, 1 = {rd-sae}, 2 = {ru-sae},
; 3 = {rz-sae}, 4 = current/MXCSR mode (no rounding operand emitted).
; maskz_* variants check zero-masking {z}; mask_* variants check
; merge-masking into %src.
define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}
2785
2786
2787define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2788; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
2789; CHECK:       ## BB#0:
2790; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
2791; CHECK-NEXT:    retq
2792  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
2793  ret <16 x float> %res
2794}
2795define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2796; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
2797; CHECK:       ## BB#0:
2798; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
2799; CHECK-NEXT:    retq
2800  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
2801  ret <16 x float> %res
2802}
2803define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2804; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
2805; CHECK:       ## BB#0:
2806; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
2807; CHECK-NEXT:    retq
2808  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
2809  ret <16 x float> %res
2810}
2811
2812define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2813; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
2814; CHECK:       ## BB#0:
2815; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
2816; CHECK-NEXT:    retq
2817  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
2818  ret <16 x float> %res
2819}
2820
2821define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2822; CHECK-LABEL: test_mm512_add_round_ps_current:
2823; CHECK:       ## BB#0:
2824; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
2825; CHECK-NEXT:    retq
2826  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
2827  ret <16 x float> %res
2828}
2829declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2830
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for @llvm.x86.avx512.mask.sub.ps.512 (vsubps).  Final i32
; operand = rounding mode: 0 {rn-sae}, 1 {rd-sae}, 2 {ru-sae}, 3 {rz-sae},
; 4 current rounding (no suffix).  "mask" tests merge into %src via {%k1};
; unmasked tests pass mask -1.
; ---------------------------------------------------------------------------
2831define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2832; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
2833; CHECK:       ## BB#0:
2834; CHECK-NEXT:    kmovw %edi, %k1
2835; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2836; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2837; CHECK-NEXT:    retq
2838  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
2839  ret <16 x float> %res
2840}
2841define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2842; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
2843; CHECK:       ## BB#0:
2844; CHECK-NEXT:    kmovw %edi, %k1
2845; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2846; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2847; CHECK-NEXT:    retq
2848  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
2849  ret <16 x float> %res
2850}
2851define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2852; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
2853; CHECK:       ## BB#0:
2854; CHECK-NEXT:    kmovw %edi, %k1
2855; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2856; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2857; CHECK-NEXT:    retq
2858  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
2859  ret <16 x float> %res
2860}
2861
2862define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2863; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
2864; CHECK:       ## BB#0:
2865; CHECK-NEXT:    kmovw %edi, %k1
2866; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2867; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2868; CHECK-NEXT:    retq
2869  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
2870  ret <16 x float> %res
2871}
2872
2873
2874define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2875; CHECK-LABEL: test_mm512_mask_sub_round_ps_current:
2876; CHECK:       ## BB#0:
2877; CHECK-NEXT:    kmovw %edi, %k1
2878; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
2879; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2880; CHECK-NEXT:    retq
2881  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
2882  ret <16 x float> %res
2883}
2884
2885define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2886; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
2887; CHECK:       ## BB#0:
2888; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
2889; CHECK-NEXT:    retq
2890  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
2891  ret <16 x float> %res
2892}
2893define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2894; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
2895; CHECK:       ## BB#0:
2896; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
2897; CHECK-NEXT:    retq
2898  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
2899  ret <16 x float> %res
2900}
2901define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2902; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
2903; CHECK:       ## BB#0:
2904; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
2905; CHECK-NEXT:    retq
2906  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
2907  ret <16 x float> %res
2908}
2909
2910define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2911; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
2912; CHECK:       ## BB#0:
2913; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
2914; CHECK-NEXT:    retq
2915  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
2916  ret <16 x float> %res
2917}
2918
2919define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2920; CHECK-LABEL: test_mm512_sub_round_ps_current:
2921; CHECK:       ## BB#0:
2922; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm0
2923; CHECK-NEXT:    retq
2924  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
2925  ret <16 x float> %res
2926}
2927
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for @llvm.x86.avx512.mask.div.ps.512 (vdivps).  Final i32
; operand = rounding mode: 0 {rn-sae}, 1 {rd-sae}, 2 {ru-sae}, 3 {rz-sae},
; 4 current rounding (no suffix).  "maskz" = zeroinitializer passthru ({z});
; "mask" = merge into %src; unmasked = mask -1.
; ---------------------------------------------------------------------------
2928define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2929; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
2930; CHECK:       ## BB#0:
2931; CHECK-NEXT:    kmovw %edi, %k1
2932; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2933; CHECK-NEXT:    retq
2934  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
2935  ret <16 x float> %res
2936}
2937define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2938; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
2939; CHECK:       ## BB#0:
2940; CHECK-NEXT:    kmovw %edi, %k1
2941; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2942; CHECK-NEXT:    retq
2943  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
2944  ret <16 x float> %res
2945}
2946define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2947; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
2948; CHECK:       ## BB#0:
2949; CHECK-NEXT:    kmovw %edi, %k1
2950; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2951; CHECK-NEXT:    retq
2952  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
2953  ret <16 x float> %res
2954}
2955
2956define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2957; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
2958; CHECK:       ## BB#0:
2959; CHECK-NEXT:    kmovw %edi, %k1
2960; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2961; CHECK-NEXT:    retq
2962  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
2963  ret <16 x float> %res
2964}
2965
2966
2967define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2968; CHECK-LABEL: test_mm512_maskz_div_round_ps_current:
2969; CHECK:       ## BB#0:
2970; CHECK-NEXT:    kmovw %edi, %k1
2971; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
2972; CHECK-NEXT:    retq
2973  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
2974  ret <16 x float> %res
2975}
2976
2977define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2978; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae:
2979; CHECK:       ## BB#0:
2980; CHECK-NEXT:    kmovw %edi, %k1
2981; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2982; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2983; CHECK-NEXT:    retq
2984  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
2985  ret <16 x float> %res
2986}
2987define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2988; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae:
2989; CHECK:       ## BB#0:
2990; CHECK-NEXT:    kmovw %edi, %k1
2991; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2992; CHECK-NEXT:    vmovaps %zmm2, %zmm0
2993; CHECK-NEXT:    retq
2994  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
2995  ret <16 x float> %res
2996}
2997define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2998; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae:
2999; CHECK:       ## BB#0:
3000; CHECK-NEXT:    kmovw %edi, %k1
3001; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3002; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3003; CHECK-NEXT:    retq
3004  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
3005  ret <16 x float> %res
3006}
3007
3008define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3009; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae:
3010; CHECK:       ## BB#0:
3011; CHECK-NEXT:    kmovw %edi, %k1
3012; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3013; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3014; CHECK-NEXT:    retq
3015  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
3016  ret <16 x float> %res
3017}
3018
3019
3020define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3021; CHECK-LABEL: test_mm512_mask_div_round_ps_current:
3022; CHECK:       ## BB#0:
3023; CHECK-NEXT:    kmovw %edi, %k1
3024; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
3025; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3026; CHECK-NEXT:    retq
3027  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
3028  ret <16 x float> %res
3029}
3030
3031
3032define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3033; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
3034; CHECK:       ## BB#0:
3035; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
3036; CHECK-NEXT:    retq
3037  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
3038  ret <16 x float> %res
3039}
3040define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3041; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
3042; CHECK:       ## BB#0:
3043; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
3044; CHECK-NEXT:    retq
3045  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
3046  ret <16 x float> %res
3047}
3048define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3049; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
3050; CHECK:       ## BB#0:
3051; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
3052; CHECK-NEXT:    retq
3053  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
3054  ret <16 x float> %res
3055}
3056
3057define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3058; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
3059; CHECK:       ## BB#0:
3060; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
3061; CHECK-NEXT:    retq
3062  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
3063  ret <16 x float> %res
3064}
3065
3066define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3067; CHECK-LABEL: test_mm512_div_round_ps_current:
3068; CHECK:       ## BB#0:
3069; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0
3070; CHECK-NEXT:    retq
3071  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
3072  ret <16 x float> %res
3073}
3074declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3075
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for @llvm.x86.avx512.mask.min.ps.512 (vminps).  min/max take
; no embedded rounding, only exception suppression: the CHECK lines show
; i32 8 -> {sae} and i32 4 -> current mode (no suffix).
; ---------------------------------------------------------------------------
3076define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3077; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae:
3078; CHECK:       ## BB#0:
3079; CHECK-NEXT:    kmovw %edi, %k1
3080; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3081; CHECK-NEXT:    retq
3082  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
3083  ret <16 x float> %res
3084}
3085
3086define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3087; CHECK-LABEL: test_mm512_maskz_min_round_ps_current:
3088; CHECK:       ## BB#0:
3089; CHECK-NEXT:    kmovw %edi, %k1
3090; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
3091; CHECK-NEXT:    retq
3092  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
3093  ret <16 x float> %res
3094}
3095
3096define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3097; CHECK-LABEL: test_mm512_mask_min_round_ps_sae:
3098; CHECK:       ## BB#0:
3099; CHECK-NEXT:    kmovw %edi, %k1
3100; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
3101; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3102; CHECK-NEXT:    retq
3103  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
3104  ret <16 x float> %res
3105}
3106
3107define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3108; CHECK-LABEL: test_mm512_mask_min_round_ps_current:
3109; CHECK:       ## BB#0:
3110; CHECK-NEXT:    kmovw %edi, %k1
3111; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
3112; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3113; CHECK-NEXT:    retq
3114  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
3115  ret <16 x float> %res
3116}
3117
3118define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3119; CHECK-LABEL: test_mm512_min_round_ps_sae:
3120; CHECK:       ## BB#0:
3121; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0
3122; CHECK-NEXT:    retq
3123  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
3124  ret <16 x float> %res
3125}
3126
3127define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3128; CHECK-LABEL: test_mm512_min_round_ps_current:
3129; CHECK:       ## BB#0:
3130; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
3131; CHECK-NEXT:    retq
3132  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
3133  ret <16 x float> %res
3134}
3135declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3136
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for @llvm.x86.avx512.mask.max.ps.512 (vmaxps).  As with min,
; only the exception-suppression encoding is exercised: i32 8 -> {sae},
; i32 4 -> current mode (no suffix).
; ---------------------------------------------------------------------------
3137define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3138; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae:
3139; CHECK:       ## BB#0:
3140; CHECK-NEXT:    kmovw %edi, %k1
3141; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3142; CHECK-NEXT:    retq
3143  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
3144  ret <16 x float> %res
3145}
3146
3147define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3148; CHECK-LABEL: test_mm512_maskz_max_round_ps_current:
3149; CHECK:       ## BB#0:
3150; CHECK-NEXT:    kmovw %edi, %k1
3151; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
3152; CHECK-NEXT:    retq
3153  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
3154  ret <16 x float> %res
3155}
3156
3157define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3158; CHECK-LABEL: test_mm512_mask_max_round_ps_sae:
3159; CHECK:       ## BB#0:
3160; CHECK-NEXT:    kmovw %edi, %k1
3161; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
3162; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3163; CHECK-NEXT:    retq
3164  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
3165  ret <16 x float> %res
3166}
3167
3168define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
3169; CHECK-LABEL: test_mm512_mask_max_round_ps_current:
3170; CHECK:       ## BB#0:
3171; CHECK-NEXT:    kmovw %edi, %k1
3172; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
3173; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3174; CHECK-NEXT:    retq
3175  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
3176  ret <16 x float> %res
3177}
3178
3179define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3180; CHECK-LABEL: test_mm512_max_round_ps_sae:
3181; CHECK:       ## BB#0:
3182; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0
3183; CHECK-NEXT:    retq
3184  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
3185  ret <16 x float> %res
3186}
3187
3188define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3189; CHECK-LABEL: test_mm512_max_round_ps_current:
3190; CHECK:       ## BB#0:
3191; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
3192; CHECK-NEXT:    retq
3193  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
3194  ret <16 x float> %res
3195}
3196declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3197
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for the scalar-float @llvm.x86.avx512.mask.add.ss.round
; (vaddss).  Rounding operand as elsewhere: 0 {rn-sae}, 1 {rd-sae},
; 2 {ru-sae}, 3 {rz-sae}, 4 current.  Only mask bit 0 matters for a scalar
; op, hence the emitted "andl $1, %edi" before kmovw.
; ---------------------------------------------------------------------------
3198declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
3199
3200define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3201; CHECK-LABEL: test_mask_add_ss_rn:
3202; CHECK:       ## BB#0:
3203; CHECK-NEXT:    andl $1, %edi
3204; CHECK-NEXT:    kmovw %edi, %k1
3205; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3206; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3207; CHECK-NEXT:    retq
3208  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
3209  ret <4 x float> %res
3210}
3211
3212define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3213; CHECK-LABEL: test_mask_add_ss_rd:
3214; CHECK:       ## BB#0:
3215; CHECK-NEXT:    andl $1, %edi
3216; CHECK-NEXT:    kmovw %edi, %k1
3217; CHECK-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3218; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3219; CHECK-NEXT:    retq
3220  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
3221  ret <4 x float> %res
3222}
3223
3224define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3225; CHECK-LABEL: test_mask_add_ss_ru:
3226; CHECK:       ## BB#0:
3227; CHECK-NEXT:    andl $1, %edi
3228; CHECK-NEXT:    kmovw %edi, %k1
3229; CHECK-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3230; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3231; CHECK-NEXT:    retq
3232  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
3233  ret <4 x float> %res
3234}
3235
3236define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3237; CHECK-LABEL: test_mask_add_ss_rz:
3238; CHECK:       ## BB#0:
3239; CHECK-NEXT:    andl $1, %edi
3240; CHECK-NEXT:    kmovw %edi, %k1
3241; CHECK-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3242; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3243; CHECK-NEXT:    retq
3244  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
3245  ret <4 x float> %res
3246}
3247
3248define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3249; CHECK-LABEL: test_mask_add_ss_current:
3250; CHECK:       ## BB#0:
3251; CHECK-NEXT:    andl $1, %edi
3252; CHECK-NEXT:    kmovw %edi, %k1
3253; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
3254; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3255; CHECK-NEXT:    retq
3256  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
3257  ret <4 x float> %res
3258}
3259
3260define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
3261; CHECK-LABEL: test_maskz_add_ss_rn:
3262; CHECK:       ## BB#0:
3263; CHECK-NEXT:    andl $1, %edi
3264; CHECK-NEXT:    kmovw %edi, %k1
3265; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
3266; CHECK-NEXT:    retq
3267  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0)
3268  ret <4 x float> %res
3269}
3270
3271define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
3272; CHECK-LABEL: test_add_ss_rn:
3273; CHECK:       ## BB#0:
3274; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
3275; CHECK-NEXT:    retq
3276  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
3277  ret <4 x float> %res
3278}
3279
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for the scalar-double @llvm.x86.avx512.mask.add.sd.round
; (vaddsd), mirroring the add.ss tests above: rounding operand 0-3 selects
; {rn/rd/ru/rz-sae}, 4 is current mode; "andl $1, %edi" keeps only mask
; bit 0 for the single scalar lane.
; ---------------------------------------------------------------------------
3280declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
3281
3282define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
3283; CHECK-LABEL: test_mask_add_sd_rn:
3284; CHECK:       ## BB#0:
3285; CHECK-NEXT:    andl $1, %edi
3286; CHECK-NEXT:    kmovw %edi, %k1
3287; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3288; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3289; CHECK-NEXT:    retq
3290  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
3291  ret <2 x double> %res
3292}
3293
3294define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
3295; CHECK-LABEL: test_mask_add_sd_rd:
3296; CHECK:       ## BB#0:
3297; CHECK-NEXT:    andl $1, %edi
3298; CHECK-NEXT:    kmovw %edi, %k1
3299; CHECK-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3300; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3301; CHECK-NEXT:    retq
3302  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
3303  ret <2 x double> %res
3304}
3305
3306define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
3307; CHECK-LABEL: test_mask_add_sd_ru:
3308; CHECK:       ## BB#0:
3309; CHECK-NEXT:    andl $1, %edi
3310; CHECK-NEXT:    kmovw %edi, %k1
3311; CHECK-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3312; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3313; CHECK-NEXT:    retq
3314  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
3315  ret <2 x double> %res
3316}
3317
3318define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
3319; CHECK-LABEL: test_mask_add_sd_rz:
3320; CHECK:       ## BB#0:
3321; CHECK-NEXT:    andl $1, %edi
3322; CHECK-NEXT:    kmovw %edi, %k1
3323; CHECK-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
3324; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3325; CHECK-NEXT:    retq
3326  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
3327  ret <2 x double> %res
3328}
3329
3330define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
3331; CHECK-LABEL: test_mask_add_sd_current:
3332; CHECK:       ## BB#0:
3333; CHECK-NEXT:    andl $1, %edi
3334; CHECK-NEXT:    kmovw %edi, %k1
3335; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
3336; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3337; CHECK-NEXT:    retq
3338  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
3339  ret <2 x double> %res
3340}
3341
3342define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
3343; CHECK-LABEL: test_maskz_add_sd_rn:
3344; CHECK:       ## BB#0:
3345; CHECK-NEXT:    andl $1, %edi
3346; CHECK-NEXT:    kmovw %edi, %k1
3347; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
3348; CHECK-NEXT:    retq
3349  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0)
3350  ret <2 x double> %res
3351}
3352
3353define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
3354; CHECK-LABEL: test_add_sd_rn:
3355; CHECK:       ## BB#0:
3356; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
3357; CHECK-NEXT:    retq
3358  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
3359  ret <2 x double> %res
3360}
3361
; ---------------------------------------------------------------------------
; Autogenerated CHECK lines (update_llc_test_checks.py) -- do not hand-edit.
; Codegen tests for the scalar-float @llvm.x86.avx512.mask.max.ss.round
; (vmaxss).  As with the packed min/max tests, only i32 8 -> {sae} and
; i32 4 -> current mode are exercised; "andl $1, %edi" keeps only mask
; bit 0 for the scalar lane.
; ---------------------------------------------------------------------------
3362declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
3363
3364define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3365; CHECK-LABEL: test_mask_max_ss_sae:
3366; CHECK:       ## BB#0:
3367; CHECK-NEXT:    andl $1, %edi
3368; CHECK-NEXT:    kmovw %edi, %k1
3369; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
3370; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3371; CHECK-NEXT:    retq
3372  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
3373  ret <4 x float> %res
3374}
3375
3376define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
3377; CHECK-LABEL: test_maskz_max_ss_sae:
3378; CHECK:       ## BB#0:
3379; CHECK-NEXT:    andl $1, %edi
3380; CHECK-NEXT:    kmovw %edi, %k1
3381; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
3382; CHECK-NEXT:    retq
3383  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
3384  ret <4 x float> %res
3385}
3386
3387define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
3388; CHECK-LABEL: test_max_ss_sae:
3389; CHECK:       ## BB#0:
3390; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0
3391; CHECK-NEXT:    retq
3392  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
3393  ret <4 x float> %res
3394}
3395
3396define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
3397; CHECK-LABEL: test_mask_max_ss:
3398; CHECK:       ## BB#0:
3399; CHECK-NEXT:    andl $1, %edi
3400; CHECK-NEXT:    kmovw %edi, %k1
3401; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
3402; CHECK-NEXT:    vmovaps %zmm2, %zmm0
3403; CHECK-NEXT:    retq
3404  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
3405  ret <4 x float> %res
3406}
3407
3408define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
3409; CHECK-LABEL: test_maskz_max_ss:
3410; CHECK:       ## BB#0:
3411; CHECK-NEXT:    andl $1, %edi
3412; CHECK-NEXT:    kmovw %edi, %k1
3413; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
3414; CHECK-NEXT:    retq
3415  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
3416  ret <4 x float> %res
3417}
3418
3419define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
3420; CHECK-LABEL: test_max_ss:
3421; CHECK:       ## BB#0:
3422; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
3423; CHECK-NEXT:    retq
3424  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
3425  ret <4 x float> %res
3426}
; Scalar double-precision max via llvm.x86.avx512.mask.max.sd.round.
; Same matrix as the ss tests above: merge-masked / zero-masked / unmasked,
; each with {sae} (imm 8) and with the current rounding direction (imm 4).
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
  ret <2 x double> %res
}
3492
; Signed integer -> scalar float conversions with an explicit rounding-mode
; operand (imm 3 = {rz-sae}, round toward zero), for i32/i64 sources and
; sd/ss destinations.
define <2 x double> @test_x86_avx512_cvtsi2sd32(<2 x double> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double> %a, i32 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double>, i32, i32) nounwind readnone

define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwind readnone

define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone

define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind readnone
3532
; Unsigned integer -> scalar single conversions (vcvtusi2ss), register and
; memory sources, with {rd-sae} (imm 1) and current rounding (imm 4).
; With imm 4 the memory operand folds into the instruction; with an explicit
; rounding mode it must be loaded into a register first.
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl (%rdi), %eax
; CHECK-NEXT:    vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone

define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvt_roundu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvtu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind readnone
3597
; Unsigned integer -> scalar double conversions (vcvtusi2sd).
; The 32-bit form has no rounding operand (u32 -> f64 is always exact).
define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone

; NOTE(review): the next two test names look swapped relative to their bodies:
; test_x86_avx512_mm_cvtu64_sd passes imm 1 ({rd-sae}) while
; test_x86_avx512__mm_cvt_roundu64_sd passes imm 4 (current direction).
; Behavior under test is still correct either way; confirm naming upstream.
define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 1) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}

define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 4) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
3629
; Unmasked (all-ones mask) integer min/max through the masked intrinsics:
; the mask operand is -1 so plain vpmaxsq/vpminud/vpmaxsd should be emitted.
define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_vpmaxq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
                    <8 x i64>zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpminud:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpmaxsd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
3662
; Masked integer min/max (signed/unsigned, d/q): each test runs the intrinsic
; once with a live mask (merge into %x2) and once unmasked, then adds the two
; results so both forms stay live and are verified in one function.
define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
3784
; vpermi2var (two-source permute, indices in the first data operand):
; masked + unmasked pairs for d/pd/ps/q element types; the d variant also
; exercises folding a memory operand.
declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
3853
; vpermt2var (two-source permute, indices in the first operand, table
; overwritten): zero-masked variants for d/pd/ps/q plus a merge-masked d
; variant; the d test folds a memory operand and the pd test folds a
; broadcast-from-scalar ({1to8}) memory operand.
declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm2
; CHECK-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm2
; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %x2s = load double, double* %x2ptr
  %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
  %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
  %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}


declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
3943
; vscalefpd/vscalefps with per-call embedded rounding: masked call uses
; {rz-sae} (imm 3) or {ru-sae} (imm 2), unmasked call uses {rn-sae} (imm 0).
declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
3973
3974declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
3975
3976define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3977; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
3978; CHECK:       ## BB#0:
3979; CHECK-NEXT:    kmovw %edi, %k1
3980; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
3981; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
3982; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
3983; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
3984; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
3985; CHECK-NEXT:    retq
3986    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3987    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3988    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3989    %res3 = add <16 x i8> %res0, %res1
3990    %res4 = add <16 x i8> %res3, %res2
3991    ret <16 x i8> %res4
3992}
3993
3994declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3995
3996define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3997; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
3998; CHECK:       ## BB#0:
3999; CHECK-NEXT:    kmovw %esi, %k1
4000; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
4001; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
4002; CHECK-NEXT:    retq
4003    call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4004    call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4005    ret void
4006}
4007
4008declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
4009
4010define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
4011; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
4012; CHECK:       ## BB#0:
4013; CHECK-NEXT:    kmovw %edi, %k1
4014; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
4015; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
4016; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
4017; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4018; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
4019; CHECK-NEXT:    retq
4020    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
4021    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
4022    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
4023    %res3 = add <16 x i8> %res0, %res1
4024    %res4 = add <16 x i8> %res3, %res2
4025    ret <16 x i8> %res4
4026}
4027
4028declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
4029
; Store form: an unmasked call and a %x2-masked call; both vpmovsqb stores to
; (%rdi) must appear, the masked one under {%k1}.
4030define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4031; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
4032; CHECK:       ## BB#0:
4033; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi)
4034; CHECK-NEXT:    kmovw %esi, %k1
4035; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi) {%k1}
4036; CHECK-NEXT:    retq
4037    call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4038    call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4039    ret void
4040}
4041
4042declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
4043
; Register form of the pmovus.qb intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovusqb forms are all checked.
4044define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
4045; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
4046; CHECK:       ## BB#0:
4047; CHECK-NEXT:    kmovw %edi, %k1
4048; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
4049; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
4050; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
4051; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4052; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
4053; CHECK-NEXT:    retq
4054    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
4055    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
4056    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
4057    %res3 = add <16 x i8> %res0, %res1
4058    %res4 = add <16 x i8> %res3, %res2
4059    ret <16 x i8> %res4
4060}
4061
4062declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
4063
; Store form: an unmasked call and a %x2-masked call; both vpmovusqb stores to
; (%rdi) must appear, the masked one under {%k1}.
4064define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4065; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
4066; CHECK:       ## BB#0:
4067; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi)
4068; CHECK-NEXT:    kmovw %esi, %k1
4069; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi) {%k1}
4070; CHECK-NEXT:    retq
4071    call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4072    call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4073    ret void
4074}
4075
4076declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4077
; Register form of the pmov.qw intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovqw forms are all checked.
4078define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
4079; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
4080; CHECK:       ## BB#0:
4081; CHECK-NEXT:    kmovw %edi, %k1
4082; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
4083; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
4084; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
4085; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
4086; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
4087; CHECK-NEXT:    retq
4088    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
4089    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
4090    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
4091    %res3 = add <8 x i16> %res0, %res1
4092    %res4 = add <8 x i16> %res3, %res2
4093    ret <8 x i16> %res4
4094}
4095
4096declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4097
; Store form: an unmasked call and a %x2-masked call; both vpmovqw stores to
; (%rdi) must appear, the masked one under {%k1}.
4098define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4099; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
4100; CHECK:       ## BB#0:
4101; CHECK-NEXT:    kmovw %esi, %k1
4102; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
4103; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
4104; CHECK-NEXT:    retq
4105    call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4106    call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4107    ret void
4108}
4109
4110declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4111
; Register form of the pmovs.qw intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovsqw forms are all checked.
4112define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
4113; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
4114; CHECK:       ## BB#0:
4115; CHECK-NEXT:    kmovw %edi, %k1
4116; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
4117; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
4118; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
4119; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
4120; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
4121; CHECK-NEXT:    retq
4122    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
4123    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
4124    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
4125    %res3 = add <8 x i16> %res0, %res1
4126    %res4 = add <8 x i16> %res3, %res2
4127    ret <8 x i16> %res4
4128}
4129
4130declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4131
; Store form: an unmasked call and a %x2-masked call; both vpmovsqw stores to
; (%rdi) must appear, the masked one under {%k1}.
4132define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4133; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
4134; CHECK:       ## BB#0:
4135; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi)
4136; CHECK-NEXT:    kmovw %esi, %k1
4137; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi) {%k1}
4138; CHECK-NEXT:    retq
4139    call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4140    call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4141    ret void
4142}
4143
4144declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
4145
; Register form of the pmovus.qw intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovusqw forms are all checked.
4146define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
4147; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
4148; CHECK:       ## BB#0:
4149; CHECK-NEXT:    kmovw %edi, %k1
4150; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
4151; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
4152; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
4153; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
4154; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
4155; CHECK-NEXT:    retq
4156    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
4157    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
4158    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
4159    %res3 = add <8 x i16> %res0, %res1
4160    %res4 = add <8 x i16> %res3, %res2
4161    ret <8 x i16> %res4
4162}
4163
4164declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4165
; Store form: an unmasked call and a %x2-masked call; both vpmovusqw stores to
; (%rdi) must appear, the masked one under {%k1}.
4166define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4167; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
4168; CHECK:       ## BB#0:
4169; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi)
4170; CHECK-NEXT:    kmovw %esi, %k1
4171; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi) {%k1}
4172; CHECK-NEXT:    retq
4173    call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4174    call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4175    ret void
4176}
4177
4178declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4179
; Register form of the pmov.qd intrinsic (ymm result): all-ones, merge and
; zeroing masks; the adds keep all three results live so the plain, {%k1} and
; {%k1} {z} vpmovqd forms are all checked.
4180define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
4181; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
4182; CHECK:       ## BB#0:
4183; CHECK-NEXT:    kmovw %edi, %k1
4184; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
4185; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
4186; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
4187; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4188; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
4189; CHECK-NEXT:    retq
4190    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
4191    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
4192    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
4193    %res3 = add <8 x i32> %res0, %res1
4194    %res4 = add <8 x i32> %res3, %res2
4195    ret <8 x i32> %res4
4196}
4197
4198declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
4199
; Store form: an unmasked call and a %x2-masked call; both vpmovqd stores to
; (%rdi) must appear, the masked one under {%k1}.
4200define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4201; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
4202; CHECK:       ## BB#0:
4203; CHECK-NEXT:    kmovw %esi, %k1
4204; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
4205; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
4206; CHECK-NEXT:    retq
4207    call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4208    call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4209    ret void
4210}
4211
4212declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4213
; Register form of the pmovs.qd intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovsqd forms are all checked.
4214define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
4215; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
4216; CHECK:       ## BB#0:
4217; CHECK-NEXT:    kmovw %edi, %k1
4218; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
4219; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
4220; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
4221; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4222; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
4223; CHECK-NEXT:    retq
4224    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
4225    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
4226    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
4227    %res3 = add <8 x i32> %res0, %res1
4228    %res4 = add <8 x i32> %res3, %res2
4229    ret <8 x i32> %res4
4230}
4231
4232declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
4233
; Store form: an unmasked call and a %x2-masked call; both vpmovsqd stores to
; (%rdi) must appear, the masked one under {%k1}.
4234define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4235; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
4236; CHECK:       ## BB#0:
4237; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi)
4238; CHECK-NEXT:    kmovw %esi, %k1
4239; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi) {%k1}
4240; CHECK-NEXT:    retq
4241    call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4242    call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4243    ret void
4244}
4245
4246declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
4247
; Register form of the pmovus.qd intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovusqd forms are all checked.
4248define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
4249; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
4250; CHECK:       ## BB#0:
4251; CHECK-NEXT:    kmovw %edi, %k1
4252; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
4253; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
4254; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
4255; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4256; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
4257; CHECK-NEXT:    retq
4258    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
4259    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
4260    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
4261    %res3 = add <8 x i32> %res0, %res1
4262    %res4 = add <8 x i32> %res3, %res2
4263    ret <8 x i32> %res4
4264}
4265
4266declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
4267
; Store form: an unmasked call and a %x2-masked call; both vpmovusqd stores to
; (%rdi) must appear, the masked one under {%k1}.
4268define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
4269; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
4270; CHECK:       ## BB#0:
4271; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi)
4272; CHECK-NEXT:    kmovw %esi, %k1
4273; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi) {%k1}
4274; CHECK-NEXT:    retq
4275    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
4276    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
4277    ret void
4278}
4279
4280declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
4281
; Register form of the pmov.db intrinsic (i16 mask): all-ones, merge and
; zeroing masks; the adds keep all three results live so the plain, {%k1} and
; {%k1} {z} vpmovdb forms are all checked.
4282define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
4283; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
4284; CHECK:       ## BB#0:
4285; CHECK-NEXT:    kmovw %edi, %k1
4286; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
4287; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
4288; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
4289; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4290; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
4291; CHECK-NEXT:    retq
4292    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
4293    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
4294    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
4295    %res3 = add <16 x i8> %res0, %res1
4296    %res4 = add <16 x i8> %res3, %res2
4297    ret <16 x i8> %res4
4298}
4299
4300declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
4301
; Store form: an unmasked call and a %x2-masked call; both vpmovdb stores to
; (%rdi) must appear, the masked one under {%k1}.
4302define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4303; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
4304; CHECK:       ## BB#0:
4305; CHECK-NEXT:    kmovw %esi, %k1
4306; CHECK-NEXT:    vpmovdb %zmm0, (%rdi)
4307; CHECK-NEXT:    vpmovdb %zmm0, (%rdi) {%k1}
4308; CHECK-NEXT:    retq
4309    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4310    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4311    ret void
4312}
4313
4314declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
4315
; Register form of the pmovs.db intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovsdb forms are all checked.
4316define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
4317; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
4318; CHECK:       ## BB#0:
4319; CHECK-NEXT:    kmovw %edi, %k1
4320; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
4321; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
4322; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
4323; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4324; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
4325; CHECK-NEXT:    retq
4326    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
4327    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
4328    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
4329    %res3 = add <16 x i8> %res0, %res1
4330    %res4 = add <16 x i8> %res3, %res2
4331    ret <16 x i8> %res4
4332}
4333
4334declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
4335
; Store form: an unmasked call and a %x2-masked call; both vpmovsdb stores to
; (%rdi) must appear, the masked one under {%k1}.
4336define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4337; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
4338; CHECK:       ## BB#0:
4339; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi)
4340; CHECK-NEXT:    kmovw %esi, %k1
4341; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi) {%k1}
4342; CHECK-NEXT:    retq
4343    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4344    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4345    ret void
4346}
4347
4348declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
4349
; Register form of the pmovus.db intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovusdb forms are all checked.
4350define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
4351; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
4352; CHECK:       ## BB#0:
4353; CHECK-NEXT:    kmovw %edi, %k1
4354; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
4355; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
4356; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
4357; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4358; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
4359; CHECK-NEXT:    retq
4360    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
4361    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
4362    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
4363    %res3 = add <16 x i8> %res0, %res1
4364    %res4 = add <16 x i8> %res3, %res2
4365    ret <16 x i8> %res4
4366}
4367
4368declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
4369
; Store form: an unmasked call and a %x2-masked call; both vpmovusdb stores to
; (%rdi) must appear, the masked one under {%k1}.
4370define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4371; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
4372; CHECK:       ## BB#0:
4373; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi)
4374; CHECK-NEXT:    kmovw %esi, %k1
4375; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi) {%k1}
4376; CHECK-NEXT:    retq
4377    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4378    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4379    ret void
4380}
4381
4382declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
4383
; Register form of the pmov.dw intrinsic (ymm result): all-ones, merge and
; zeroing masks; the adds keep all three results live so the plain, {%k1} and
; {%k1} {z} vpmovdw forms are all checked.
4384define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4385; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
4386; CHECK:       ## BB#0:
4387; CHECK-NEXT:    kmovw %edi, %k1
4388; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
4389; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
4390; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
4391; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
4392; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
4393; CHECK-NEXT:    retq
4394    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4395    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4396    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4397    %res3 = add <16 x i16> %res0, %res1
4398    %res4 = add <16 x i16> %res3, %res2
4399    ret <16 x i16> %res4
4400}
4401
4402declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4403
; Store form: an unmasked call and a %x2-masked call; both vpmovdw stores to
; (%rdi) must appear, the masked one under {%k1}.
4404define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4405; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
4406; CHECK:       ## BB#0:
4407; CHECK-NEXT:    kmovw %esi, %k1
4408; CHECK-NEXT:    vpmovdw %zmm0, (%rdi)
4409; CHECK-NEXT:    vpmovdw %zmm0, (%rdi) {%k1}
4410; CHECK-NEXT:    retq
4411    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4412    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4413    ret void
4414}
4415
4416declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4417
; Register form of the pmovs.dw intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovsdw forms are all checked.
4418define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4419; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
4420; CHECK:       ## BB#0:
4421; CHECK-NEXT:    kmovw %edi, %k1
4422; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
4423; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
4424; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
4425; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
4426; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
4427; CHECK-NEXT:    retq
4428    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4429    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4430    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4431    %res3 = add <16 x i16> %res0, %res1
4432    %res4 = add <16 x i16> %res3, %res2
4433    ret <16 x i16> %res4
4434}
4435
4436declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4437
; Store form: an unmasked call and a %x2-masked call; both vpmovsdw stores to
; (%rdi) must appear, the masked one under {%k1}.
4438define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4439; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
4440; CHECK:       ## BB#0:
4441; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi)
4442; CHECK-NEXT:    kmovw %esi, %k1
4443; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi) {%k1}
4444; CHECK-NEXT:    retq
4445    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4446    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4447    ret void
4448}
4449
4450declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
4451
; Register form of the pmovus.dw intrinsic: all-ones, merge and zeroing masks;
; the adds keep all three results live so the plain, {%k1} and {%k1} {z}
; vpmovusdw forms are all checked.
4452define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4453; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
4454; CHECK:       ## BB#0:
4455; CHECK-NEXT:    kmovw %edi, %k1
4456; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
4457; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
4458; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
4459; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
4460; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
4461; CHECK-NEXT:    retq
4462    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4463    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4464    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4465    %res3 = add <16 x i16> %res0, %res1
4466    %res4 = add <16 x i16> %res3, %res2
4467    ret <16 x i16> %res4
4468}
4469
4470declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4471
; Store form: an unmasked call and a %x2-masked call; both vpmovusdw stores to
; (%rdi) must appear, the masked one under {%k1}.
4472define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4473; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
4474; CHECK:       ## BB#0:
4475; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi)
4476; CHECK-NEXT:    kmovw %esi, %k1
4477; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi) {%k1}
4478; CHECK-NEXT:    retq
4479    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4480    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4481    ret void
4482}
4483
4484declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
4485
; Masked vcvtdq2pd (no rounding operand on this intrinsic): one merge-masked
; call and one unmasked call, summed so both the {%k1} and plain forms are
; checked.
4486define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
4487; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
4488; CHECK:       ## BB#0:
4489; CHECK-NEXT:    kmovw %edi, %k1
4490; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1}
4491; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
4492; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
4493; CHECK-NEXT:    retq
4494  %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
4495  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
4496  %res2 = fadd <8 x double> %res, %res1
4497  ret <8 x double> %res2
4498}
4499
4500declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
4501
; Masked vcvtdq2ps with rounding control: the trailing i32 operand selects the
; bracketed rounding mode in the CHECK lines (4 -> no bracket, 0 -> {rn-sae});
; the fadd keeps both forms live.
4502define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4503; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
4504; CHECK:       ## BB#0:
4505; CHECK-NEXT:    kmovw %edi, %k1
4506; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm1 {%k1}
4507; CHECK-NEXT:    vcvtdq2ps {rn-sae}, %zmm0, %zmm0
4508; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
4509; CHECK-NEXT:    retq
4510  %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
4511  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
4512  %res2 = fadd <16 x float> %res, %res1
4513  ret <16 x float> %res2
4514}
4515
4516declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4517
; Masked vcvtpd2dq with rounding control: i32 4 -> plain masked form, i32 0 ->
; unmasked {rn-sae}; the add keeps both forms live.
4518define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4519; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
4520; CHECK:       ## BB#0:
4521; CHECK-NEXT:    kmovw %edi, %k1
4522; CHECK-NEXT:    vcvtpd2dq %zmm0, %ymm1 {%k1}
4523; CHECK-NEXT:    vcvtpd2dq {rn-sae}, %zmm0, %ymm0
4524; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
4525; CHECK-NEXT:    retq
4526  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4527  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
4528  %res2 = add <8 x i32> %res, %res1
4529  ret <8 x i32> %res2
4530}
4531
4532declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
4533
; Masked vcvtpd2ps with rounding control: i32 4 -> plain masked form, i32 2 ->
; unmasked {ru-sae}; the fadd keeps both forms live.
4534define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
4535; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
4536; CHECK:       ## BB#0:
4537; CHECK-NEXT:    kmovw %edi, %k1
4538; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm1 {%k1}
4539; CHECK-NEXT:    vcvtpd2ps {ru-sae}, %zmm0, %ymm0
4540; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
4541; CHECK-NEXT:    retq
4542  %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
4543  %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 2)
4544  %res2 = fadd <8 x float> %res, %res1
4545  ret <8 x float> %res2
4546}
4547
4548declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4549
; Masked vcvtpd2udq with rounding control: i32 2 -> masked {ru-sae}, i32 0 ->
; unmasked {rn-sae}; the add keeps both forms live.
4550define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4551; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
4552; CHECK:       ## BB#0:
4553; CHECK-NEXT:    kmovw %edi, %k1
4554; CHECK-NEXT:    vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
4555; CHECK-NEXT:    vcvtpd2udq {rn-sae}, %zmm0, %ymm0
4556; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
4557; CHECK-NEXT:    retq
4558  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 2)
4559  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
4560  %res2 = add <8 x i32> %res, %res1
4561  ret <8 x i32> %res2
4562}
4563
4564declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4565
; Masked vcvtps2dq with rounding control: i32 2 -> masked {ru-sae}, i32 0 ->
; unmasked {rn-sae}; the add keeps both forms live.
4566define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4567; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
4568; CHECK:       ## BB#0:
4569; CHECK-NEXT:    kmovw %edi, %k1
4570; CHECK-NEXT:    vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
4571; CHECK-NEXT:    vcvtps2dq {rn-sae}, %zmm0, %zmm0
4572; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
4573; CHECK-NEXT:    retq
4574  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
4575  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
4576  %res2 = add <16 x i32> %res, %res1
4577  ret <16 x i32> %res2
4578}
4579
4580declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
4581
; Masked vcvtps2pd: i32 4 -> plain masked form, i32 8 -> unmasked {sae}; the
; fadd keeps both forms live.
4582define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
4583; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
4584; CHECK:       ## BB#0:
4585; CHECK-NEXT:    kmovw %edi, %k1
4586; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm1 {%k1}
4587; CHECK-NEXT:    vcvtps2pd {sae}, %ymm0, %zmm0
4588; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
4589; CHECK-NEXT:    retq
4590  %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
4591  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
4592  %res2 = fadd <8 x double> %res, %res1
4593  ret <8 x double> %res2
4594}
4595
4596declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
4597
; Masked vcvtps2udq with rounding control: i32 2 -> masked {ru-sae}, i32 0 ->
; unmasked {rn-sae}; the add keeps both forms live.
4598define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4599; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
4600; CHECK:       ## BB#0:
4601; CHECK-NEXT:    kmovw %edi, %k1
4602; CHECK-NEXT:    vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
4603; CHECK-NEXT:    vcvtps2udq {rn-sae}, %zmm0, %zmm0
4604; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
4605; CHECK-NEXT:    retq
4606  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
4607  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
4608  %res2 = add <16 x i32> %res, %res1
4609  ret <16 x i32> %res2
4610}
4611
4612declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4613
; Masked vcvttpd2dq: i32 4 -> plain masked form, i32 8 -> unmasked {sae}; the
; add keeps both forms live.
4614define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4615; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
4616; CHECK:       ## BB#0:
4617; CHECK-NEXT:    kmovw %edi, %k1
4618; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm1 {%k1}
4619; CHECK-NEXT:    vcvttpd2dq {sae}, %zmm0, %ymm0
4620; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
4621; CHECK-NEXT:    retq
4622  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4623  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4624  %res2 = add <8 x i32> %res, %res1
4625  ret <8 x i32> %res2
4626}
4627
4628declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
4629
; Masked vcvtudq2pd (no rounding operand on this intrinsic): one merge-masked
; call and one unmasked call, summed so both the {%k1} and plain forms are
; checked.
4630define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
4631; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
4632; CHECK:       ## BB#0:
4633; CHECK-NEXT:    kmovw %edi, %k1
4634; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1}
4635; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
4636; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
4637; CHECK-NEXT:    retq
4638  %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
4639  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
4640  %res2 = fadd <8 x double> %res, %res1
4641  ret <8 x double> %res2
4642}
4643
4644
4645declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
4646
; Masked vcvtudq2ps with rounding control: i32 4 -> plain masked form, i32 0 ->
; unmasked {rn-sae}; the fadd keeps both forms live.
4647define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4648; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
4649; CHECK:       ## BB#0:
4650; CHECK-NEXT:    kmovw %edi, %k1
4651; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm1 {%k1}
4652; CHECK-NEXT:    vcvtudq2ps {rn-sae}, %zmm0, %zmm0
4653; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
4654; CHECK-NEXT:    retq
4655  %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
4656  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
4657  %res2 = fadd <16 x float> %res, %res1
4658  ret <16 x float> %res2
4659}
4660
4661declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4662
; Masked vcvttpd2udq: i32 4 -> plain masked form, i32 8 -> unmasked {sae}; the
; add keeps both forms live.
4663define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4664; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
4665; CHECK:       ## BB#0:
4666; CHECK-NEXT:    kmovw %edi, %k1
4667; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm1 {%k1}
4668; CHECK-NEXT:    vcvttpd2udq {sae}, %zmm0, %ymm0
4669; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
4670; CHECK-NEXT:    retq
4671  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4672  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4673  %res2 = add <8 x i32> %res, %res1
4674  ret <8 x i32> %res2
4675}
4676
4677declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4678
; Masked vcvttps2dq: i32 4 -> plain masked form, i32 8 -> unmasked {sae}; the
; add keeps both forms live.
4679define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4680; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
4681; CHECK:       ## BB#0:
4682; CHECK-NEXT:    kmovw %edi, %k1
4683; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm1 {%k1}
4684; CHECK-NEXT:    vcvttps2dq {sae}, %zmm0, %zmm0
4685; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
4686; CHECK-NEXT:    retq
4687  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
4688  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4689  %res2 = add <16 x i32> %res, %res1
4690  ret <16 x i32> %res2
4691}

declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)

; Checks masked vcvttps2udq (i32 4) and unmasked {sae} form (i32 8).
define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvttps2udq {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Exercises vgetexpss in four variants: masked/current-rounding, masked/{sae},
; zero-masked/{sae}, and unmasked/{sae}; all four results are added together.
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res   = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}

declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

; Exercises vgetexpsd in four variants: masked/current-rounding, masked/{sae},
; zero-masked/{sae}, and unmasked/current-rounding; results summed.
define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddpd %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res   = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}

declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)

; Single masked scalar compare: predicate 5 (NLT) with {sae} (i32 8).
define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT:    retq

  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
  ret i8 %res4
}

; Combines four vcmpsd variants (predicates 2-5, masked/unmasked, with and
; without {sae}) via OR, checking the expected kor/kand mask arithmetic.
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k1
; CHECK-NEXT:    korw %k0, %k1, %k0
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k2
; CHECK-NEXT:    korw %k1, %k2, %k1
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k2
; CHECK-NEXT:    kandw %k2, %k1, %k1
; CHECK-NEXT:    korw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT:    retq

  %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)

  %res11 = or i8 %res1, %res2
  %res12 = or i8 %res3, %res4
  %res13 = or i8 %res11, %res12
  ret i8 %res13
}

declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)

; Single masked scalar compare: predicate 3 (UNORD), current rounding (i32 4).
define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT:    retq

  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
  ret i8 %res2
}


; Combines four vcmpss variants (predicates 2-5, masked/unmasked, with and
; without {sae}) via AND of the i8 results.
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k2 {%k1}
; CHECK-NEXT:    kmovw %k2, %ecx
; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
; CHECK-NEXT:    kmovw %k1, %eax
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    andb %cl, %al
; CHECK-NEXT:    andb %dl, %al
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT:    retq
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)

  %res11 = and i8 %res1, %res2
  %res12 = and i8 %res3, %res4
  %res13 = and i8 %res11, %res12
  ret i8 %res13
}

declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)

; vshuff32x4 with imm 22: masked and unmasked forms, results summed.
define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)

; vshuff64x2 with imm 22: masked, unmasked, and zero-masked forms, summed.
define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)

; vshufi32x4 with imm 22: masked and unmasked forms, results summed.
define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)

; vshufi64x2 with imm 22: masked and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

; vgetmantpd imm 11: masked/current-rounding and unmasked/{sae} forms, summed.
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantpd $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

; vgetmantps imm 11: masked/current-rounding and unmasked/{sae} forms, summed.
define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantps $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)

; vgetmantsd imm 11 in four variants: masked, zero-masked, masked/{sae},
; and unmasked; all results summed.
define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5
; CHECK-NEXT:    vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm0
; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res  = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
  %res11 = fadd <2 x double> %res, %res1
  %res12 = fadd <2 x double> %res2, %res3
  %res13 = fadd <2 x double> %res11, %res12
  ret <2 x double> %res13
}

declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)

; vgetmantss imm 11 in four variants: masked, zero-masked, unmasked/{sae},
; and unmasked/current-rounding; all results summed.
define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res  = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
  %res11 = fadd <4 x float> %res, %res1
  %res12 = fadd <4 x float> %res2, %res3
  %res13 = fadd <4 x float> %res11, %res12
  ret <4 x float> %res13
}

declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)

; vshufpd imm 22: masked, unmasked, and zero-masked forms, results summed.
define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm3 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)

; vshufps imm 22: masked and unmasked forms, results summed.
define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)

; vpermilpd (variable): masked, zero-masked, and unmasked forms, summed.
define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

; vpermilps (variable): masked, zero-masked, and unmasked forms, summed.
define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)

; vinsertf32x4 at lane 1: masked, unmasked, and zero-masked forms, summed.
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)

; vinserti32x4 at lane 1: masked, unmasked, and zero-masked forms, summed.
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)

; vinsertf64x4 at lane 1: masked, unmasked, and zero-masked forms, summed.
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)

; vinserti64x4 at lane 1: masked, unmasked, and zero-masked forms, summed.
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)

; vcvtss2sd: masked/current-rounding and unmasked/{sae} forms, summed.
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)

; vcvtsd2ss: masked {rz-sae} (i32 3) and unmasked {rn-sae} (i32 8) forms, summed.
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)

; vpternlogd imm 33: merge-masked and unmasked forms, results summed.
define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)

; vpternlogd imm 33: zero-masked and unmasked forms, results summed.
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)

; vpternlogq imm 33: merge-masked and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)

; vpternlogq imm 33: zero-masked and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
5261
; vcomi.sd with predicate imm 0 and rounding imm 8 ({sae}): expected lowering
; is a single vcmpeqsd {sae} into a mask register, read back with kmovw.
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpeqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
  ret i32 %res
}
5271
; Unordered variant: predicate imm 8 with {sae} (rounding imm 8) lowers to
; vcmpeq_uqsd {sae} (equal, unordered-quiet) into a mask register.
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
  ret i32 %res
}
5281
; Same as the eq_sae test but with rounding imm 4 (no SAE): expected lowering
; is plain vcmpeqsd without the {sae} modifier.
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpeqsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
  ret i32 %res
}
5291
; Unordered equal (predicate imm 8), no SAE (rounding imm 4): lowers to
; vcmpeq_uqsd into a mask register.
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpeq_uqsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
  ret i32 %res
}
5301
; Predicate imm 1 with {sae}: expected lowering is vcmpltsd {sae} into a
; mask register, read back with kmovw.
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpltsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
  ret i32 %res
}
5311
; Predicate imm 9 with {sae}: expected lowering is vcmpngesd {sae}
; (not-greater-or-equal, the unordered counterpart of less-than).
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpngesd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
  ret i32 %res
}
5321
; Predicate imm 1, no SAE (rounding imm 4): lowers to plain vcmpltsd.
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpltsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
  ret i32 %res
}
5331
; Predicate imm 9, no SAE: lowers to plain vcmpngesd.
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpngesd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
  ret i32 %res
}
5341
5342declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
5343
; Single-precision counterpart of ucomi_sd_lt: vcomi.ss with predicate imm 9,
; no SAE, lowers to vcmpngess into a mask register.
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpngess %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
  ret i32 %res
}
5353
5354declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
5355declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
5356
; Masked scalar move, merge form: vmovss with {%k1} merging into the passthru
; %x2. Only bit 0 of the i8 mask matters, hence the leading "andl $1, %edi".
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  ret <4 x float> %res
}
5368
; Masked scalar move, zeroing form: zeroinitializer passthru selects the
; {%k1} {z} encoding of vmovss.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
  ret <4 x float> %res
}
5379
; All-ones mask (i8 -1): the mask.move.ss intrinsic folds to an unmasked
; vmovss with no k-register setup at all.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
5388
5389declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
; Double-precision counterpart of move_ss_rr: all-ones mask folds
; mask.move.sd to an unmasked vmovsd.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
  ret <2 x double> %res
}
5398
; Zeroing form of the masked scalar double move: zeroinitializer passthru
; selects vmovsd {%k1} {z}; only mask bit 0 is used ("andl $1, %edi").
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
  ret <2 x double> %res
}
5409
; Merge form of the masked scalar double move: vmovsd {%k1} merges into the
; passthru %x2, which is then copied back to the return register.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  ret <2 x double> %res
}
5421
5422declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
5423
; broadcastf32x4: replicate a 128-bit float vector into all four lanes of a
; zmm. Exercised unmasked, merge-masked, and zero-masked (zeroinitializer
; passthru); all three lower to vshuff32x4 and are summed so each appears.
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq

  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
  %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res1, %res2
  %res5 = fadd <16 x float> %res3, %res4
  ret <16 x float> %res5
}
5443
5444declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)
5445
; broadcastf64x4: replicate a 256-bit double vector into both halves of a
; zmm. Unmasked, merge-masked, and zero-masked variants all lower to
; vshuff64x2; the fadds keep all three live in the output.
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq

  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
  %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res1, %res2
  %res5 = fadd <8 x double> %res3, %res4
  ret <8 x double> %res5
}
5465
5466declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)
5467
; Integer counterpart of broadcastf32x4: replicate a 128-bit i32 vector into
; all four zmm lanes via vshufi32x4, testing unmasked / merge / zero masking.
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq

  %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
  %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res1, %res2
  %res5 = add <16 x i32> %res3, %res4
  ret <16 x i32> %res5
}
5487
5488declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)
5489
; Integer counterpart of broadcastf64x4: replicate a 256-bit i64 vector into
; both zmm halves via vshufi64x2, testing unmasked / merge / zero masking.
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq

  %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
  %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res1, %res2
  %res5 = add <8 x i64> %res3, %res4
  ret <8 x i64> %res5
}
5509
5510declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
5511
; Immediate logical right shift of qwords (vpsrlq $255): merge-masked,
; unmasked (mask -1), and zero-masked (zeroinitializer passthru) forms,
; summed so all three encodings appear in the output.
define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> zeroinitializer, i8 %x3)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5529
5530declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
5531
; Immediate logical right shift of dwords (vpsrld $255): merge-masked,
; unmasked, and zero-masked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsrld $255, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpsrld $255, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpsrld $255, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 -1)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> zeroinitializer, i16 %x3)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5549
5550declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
5551
; Immediate arithmetic right shift of dwords (vpsrad $3): merge-masked,
; zero-masked, and unmasked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5569
5570declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8)
5571
; Immediate arithmetic right shift of qwords (vpsraq $3, AVX-512-only form):
; merge-masked, zero-masked, and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5589
5590declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16)
5591
; Immediate left shift of dwords (vpslld $3): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpslld $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpslld $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpslld $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5609
5610declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8)
5611
; Immediate left shift of qwords (vpsllq $3): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5629
5630declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
5631
; Variable rotate right of dwords (vprorvd, per-lane counts in %x1):
; merge-masked, zero-masked, and unmasked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5649
5650declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
5651
; Variable rotate right of qwords (vprorvq): merge-masked, zero-masked, and
; unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5669
5670declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
5671
; Immediate rotate left of dwords (vprold $3): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprold $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vprold $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5689
5690declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
5691
; Immediate rotate left of qwords (vprolq $3): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5709
5710declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
5711
; Zero-extend 16 x i8 -> 16 x i32 (vpmovzxbd): merge-masked, zero-masked,
; and unmasked forms, summed together. The long shuffle comments were
; generated by update_llc_test_checks.py and spell out the zero-fill pattern.
define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5729
5730declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
5731
; Zero-extend the low 8 bytes of a 16 x i8 vector to 8 x i64 (vpmovzxbq):
; merge-masked, zero-masked, and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5749
5750declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
5751
; Zero-extend 8 x i32 -> 8 x i64 (vpmovzxdq): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5769
5770declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
5771
; Zero-extend 16 x i16 -> 16 x i32 (vpmovzxwd): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}
5789
5790declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
5791
; Zero-extend 8 x i16 -> 8 x i64 (vpmovzxwq): merge-masked, zero-masked,
; and unmasked forms, summed together.
define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}
5809
; Check masked, zero-masked, and unmasked lowering of the pmovsxb.d.512
; intrinsic (sign-extend <16 x i8> -> <16 x i32>); results summed so all
; three calls remain live.
5810declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
5811
5812define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
5813; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
5814; CHECK:       ## BB#0:
5815; CHECK-NEXT:    kmovw %edi, %k1
5816; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1}
5817; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm2 {%k1} {z}
5818; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0
5819; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
5820; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
5821; CHECK-NEXT:    retq
5822  %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
5823  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
5824  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
5825  %res3 = add <16 x i32> %res, %res1
5826  %res4 = add <16 x i32> %res3, %res2
5827  ret <16 x i32> %res4
5828}
5829
; Check masked, zero-masked, and unmasked lowering of the pmovsxb.q.512
; intrinsic (sign-extend low 8 bytes of <16 x i8> -> <8 x i64>).
5830declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
5831
5832define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
5833; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
5834; CHECK:       ## BB#0:
5835; CHECK-NEXT:    kmovw %edi, %k1
5836; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1}
5837; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm2 {%k1} {z}
5838; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm0
5839; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
5840; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
5841; CHECK-NEXT:    retq
5842  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
5843  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
5844  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
5845  %res3 = add <8 x i64> %res, %res1
5846  %res4 = add <8 x i64> %res3, %res2
5847  ret <8 x i64> %res4
5848}
5849
; Check masked, zero-masked, and unmasked lowering of the pmovsxd.q.512
; intrinsic (sign-extend <8 x i32> -> <8 x i64>).
5850declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
5851
5852define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
5853; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
5854; CHECK:       ## BB#0:
5855; CHECK-NEXT:    kmovw %edi, %k1
5856; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1}
5857; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm2 {%k1} {z}
5858; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0
5859; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
5860; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
5861; CHECK-NEXT:    retq
5862  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
5863  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
5864  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
5865  %res3 = add <8 x i64> %res, %res1
5866  %res4 = add <8 x i64> %res3, %res2
5867  ret <8 x i64> %res4
5868}
5869
5870
; Check masked, zero-masked, and unmasked lowering of the pmovsxw.d.512
; intrinsic (sign-extend <16 x i16> -> <16 x i32>).
5871declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
5872
5873define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
5874; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
5875; CHECK:       ## BB#0:
5876; CHECK-NEXT:    kmovw %edi, %k1
5877; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1}
5878; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm2 {%k1} {z}
5879; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0
5880; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
5881; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
5882; CHECK-NEXT:    retq
5883  %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
5884  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
5885  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
5886  %res3 = add <16 x i32> %res, %res1
5887  %res4 = add <16 x i32> %res3, %res2
5888  ret <16 x i32> %res4
5889}
5890
5891
; Check masked, zero-masked, and unmasked lowering of the pmovsxw.q.512
; intrinsic (sign-extend <8 x i16> -> <8 x i64>).
5892declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
5893
5894define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
5895; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
5896; CHECK:       ## BB#0:
5897; CHECK-NEXT:    kmovw %edi, %k1
5898; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1}
5899; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm2 {%k1} {z}
5900; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm0
5901; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
5902; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
5903; CHECK-NEXT:    retq
5904  %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
5905  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
5906  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
5907  %res3 = add <8 x i64> %res, %res1
5908  %res4 = add <8 x i64> %res3, %res2
5909  ret <8 x i64> %res4
5910}
5911
; Check masked, zero-masked, and unmasked lowering of permvar.df.512
; (variable permute of <8 x double> by <8 x i64> indices -> vpermpd).
5912declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
5913
5914define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
5915; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
5916; CHECK:       ## BB#0:
5917; CHECK-NEXT:    kmovw %edi, %k1
5918; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1}
5919; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm3 {%k1} {z}
5920; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
5921; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
5922; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
5923; CHECK-NEXT:    retq
5924  %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
5925  %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
5926  %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
5927  %res3 = fadd <8 x double> %res, %res1
5928  %res4 = fadd <8 x double> %res3, %res2
5929  ret <8 x double> %res4
5930}
5931
; Check masked, zero-masked, and unmasked lowering of permvar.di.512
; (variable permute of <8 x i64> by <8 x i64> indices -> vpermq).
5932declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
5933
5934define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
5935; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
5936; CHECK:       ## BB#0:
5937; CHECK-NEXT:    kmovw %edi, %k1
5938; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
5939; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm3 {%k1} {z}
5940; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
5941; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
5942; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
5943; CHECK-NEXT:    retq
5944  %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
5945  %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
5946  %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
5947  %res3 = add <8 x i64> %res, %res1
5948  %res4 = add <8 x i64> %res3, %res2
5949  ret <8 x i64> %res4
5950}
5951
; Check masked, zero-masked, and unmasked lowering of permvar.sf.512
; (variable permute of <16 x float> by <16 x i32> indices -> vpermps).
5952declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
5953
5954define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
5955; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
5956; CHECK:       ## BB#0:
5957; CHECK-NEXT:    kmovw %edi, %k1
5958; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1}
5959; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm3 {%k1} {z}
5960; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
5961; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
5962; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
5963; CHECK-NEXT:    retq
5964  %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
5965  %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
5966  %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
5967  %res3 = fadd <16 x float> %res, %res1
5968  %res4 = fadd <16 x float> %res3, %res2
5969  ret <16 x float> %res4
5970}
5971
; Check masked, zero-masked, and unmasked lowering of permvar.si.512
; (variable permute of <16 x i32> by <16 x i32> indices -> vpermd).
5972declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
5973
5974define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
5975; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
5976; CHECK:       ## BB#0:
5977; CHECK-NEXT:    kmovw %edi, %k1
5978; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
5979; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm3 {%k1} {z}
5980; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
5981; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
5982; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
5983; CHECK-NEXT:    retq
5984  %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
5985  %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
5986  %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
5987  %res3 = add <16 x i32> %res, %res1
5988  %res4 = add <16 x i32> %res3, %res2
5989  ret <16 x i32> %res4
5990}
5991
; Check vfixupimmpd lowering: merge-masked ($4), zero-masked dst ($5, dst
; zeroed via vpxord), and unmasked with {sae} ($3, rounding arg 8).
5992declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
5993
5994define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
5995; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
5996; CHECK:       ## BB#0:
5997; CHECK-NEXT:    kmovw %edi, %k1
5998; CHECK-NEXT:    vmovaps %zmm0, %zmm3
5999; CHECK-NEXT:    vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
6000; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
6001; CHECK-NEXT:    vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
6002; CHECK-NEXT:    vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
6003; CHECK-NEXT:    vaddpd %zmm4, %zmm3, %zmm1
6004; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
6005; CHECK-NEXT:    retq
6006  %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
6007  %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
6008  %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
6009  %res3 = fadd <8 x double> %res, %res1
6010  %res4 = fadd <8 x double> %res3, %res2
6011  ret <8 x double> %res4
6012}
6013
; Check the zero-masking vfixupimmpd variant: {z} with imm $3, {z} with a
; zeroed table operand ($5), and unmasked {sae} ($2, rounding arg 8).
6014declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
6015
6016define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
6017; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
6018; CHECK:       ## BB#0:
6019; CHECK-NEXT:    kmovw %edi, %k1
6020; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6021; CHECK-NEXT:    vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
6022; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
6023; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6024; CHECK-NEXT:    vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
6025; CHECK-NEXT:    vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
6026; CHECK-NEXT:    vaddpd %zmm5, %zmm3, %zmm1
6027; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
6028; CHECK-NEXT:    retq
6029  %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
6030  %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
6031  %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
6032  %res3 = fadd <8 x double> %res, %res1
6033  %res4 = fadd <8 x double> %res3, %res2
6034  ret <8 x double> %res4
6035}
6036
; Check scalar vfixupimmss: masked (mask truncated to 1 bit via andl $1),
; masked with zeroed table operand, and unmasked with {sae}.
6037declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
6038
6039define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
6040; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
6041; CHECK:       ## BB#0:
6042; CHECK-NEXT:    andl $1, %edi
6043; CHECK-NEXT:    kmovw %edi, %k1
6044; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6045; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
6046; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
6047; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6048; CHECK-NEXT:    vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
6049; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
6050; CHECK-NEXT:    vaddps %xmm5, %xmm3, %xmm1
6051; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
6052; CHECK-NEXT:    retq
6053  %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
6054  %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
6055  %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
6056  %res3 = fadd <4 x float> %res, %res1
6057  %res4 = fadd <4 x float> %res3, %res2
6058  ret <4 x float> %res4
6059}
6060
; Check zero-masking scalar vfixupimmss: {z} masked, unmasked (mask -1),
; and {z} masked with {sae} and a zeroed table operand.
6061declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
6062
6063define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
6064; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
6065; CHECK:       ## BB#0:
6066; CHECK-NEXT:    andl $1, %edi
6067; CHECK-NEXT:    kmovw %edi, %k1
6068; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6069; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
6070; CHECK-NEXT:    vmovaps %zmm0, %zmm4
6071; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm4
6072; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6073; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6074; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
6075; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
6076; CHECK-NEXT:    retq
6077  %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
6078  %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
6079  %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
6080  %res3 = fadd <4 x float> %res, %res1
6081  %res4 = fadd <4 x float> %res3, %res2
6082  ret <4 x float> %res4
6083}
6084
; Check packed-float vfixupimmps: merge-masked, masked with zeroed table
; operand, and unmasked with {sae}.
6085declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
6086
6087define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
6088; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
6089; CHECK:       ## BB#0:
6090; CHECK-NEXT:    kmovw %edi, %k1
6091; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6092; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
6093; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
6094; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6095; CHECK-NEXT:    vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
6096; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
6097; CHECK-NEXT:    vaddps %zmm5, %zmm3, %zmm1
6098; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
6099; CHECK-NEXT:    retq
6100  %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
6101  %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
6102  %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
6103  %res3 = fadd <16 x float> %res, %res1
6104  %res4 = fadd <16 x float> %res3, %res2
6105  ret <16 x float> %res4
6106}
6107
; Check zero-masking packed-float vfixupimmps: {z} masked, unmasked
; (mask -1), and {z} masked with {sae} and a zeroed table operand.
6108declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
6109
6110define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
6111; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
6112; CHECK:       ## BB#0:
6113; CHECK-NEXT:    kmovw %edi, %k1
6114; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6115; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
6116; CHECK-NEXT:    vmovaps %zmm0, %zmm4
6117; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm4
6118; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
6119; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
6120; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
6121; CHECK-NEXT:    vaddps %zmm4, %zmm0, %zmm0
6122; CHECK-NEXT:    retq
6123  %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
6124  %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
6125  %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
6126  %res3 = fadd <16 x float> %res, %res1
6127  %res4 = fadd <16 x float> %res3, %res2
6128  ret <16 x float> %res4
6129}
6130
; Check scalar vfixupimmsd: merge-masked, unmasked, and masked with {sae}
; and a zeroed table operand.
6131declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
6132
6133define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
6134; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
6135; CHECK:       ## BB#0:
6136; CHECK-NEXT:    andl $1, %edi
6137; CHECK-NEXT:    kmovw %edi, %k1
6138; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6139; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
6140; CHECK-NEXT:    vmovaps %zmm0, %zmm4
6141; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm4
6142; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6143; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
6144; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
6145; CHECK-NEXT:    vaddpd %xmm4, %xmm0, %xmm0
6146; CHECK-NEXT:    retq
6147  %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
6148  %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
6149  %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
6150  %res3 = fadd <2 x double> %res, %res1
6151  %res4 = fadd <2 x double> %res3, %res2
6152  ret <2 x double> %res4
6153}
6154
; Check zero-masking scalar vfixupimmsd: {z} masked, {z} with {sae} and a
; zeroed table operand, and {z} with {sae}.
; NOTE(review): unlike the sibling tests, %res2 passes i8 %x4 rather than
; i8 -1, so the unmasked form is never exercised here -- TODO confirm this
; was intended.
6155declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
6156
6157define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
6158; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
6159; CHECK:       ## BB#0:
6160; CHECK-NEXT:    andl $1, %edi
6161; CHECK-NEXT:    kmovw %edi, %k1
6162; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6163; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
6164; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
6165; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6166; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
6167; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6168; CHECK-NEXT:    vaddpd %xmm5, %xmm3, %xmm1
6169; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
6170; CHECK-NEXT:    retq
6171  %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
6172  %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
6173  %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
6174  %res3 = fadd <2 x double> %res, %res1
6175  %res4 = fadd <2 x double> %res3, %res2
6176  ret <2 x double> %res4
6177}
6178
; Check vptestnmd with and without a mask; the two i16 mask results are
; added so both forms appear in the output.
6179declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
6180
6181define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
6182; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
6183; CHECK:       ## BB#0:
6184; CHECK-NEXT:    kmovw %edi, %k1
6185; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0 {%k1}
6186; CHECK-NEXT:    kmovw %k0, %ecx
6187; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0
6188; CHECK-NEXT:    kmovw %k0, %eax
6189; CHECK-NEXT:    addl %ecx, %eax
6190; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
6191; CHECK-NEXT:    retq
6192  %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
6193  %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
6194  %res2 = add i16 %res, %res1
6195  ret i16 %res2
6196}
6197
; Check vptestnmq with and without a mask; the two i8 mask results are
; added so both forms appear in the output.
6198declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
6199
6200define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
6201; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
6202; CHECK:       ## BB#0:
6203; CHECK-NEXT:    kmovw %edi, %k1
6204; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0 {%k1}
6205; CHECK-NEXT:    kmovw %k0, %ecx
6206; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0
6207; CHECK-NEXT:    kmovw %k0, %eax
6208; CHECK-NEXT:    addb %cl, %al
6209; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
6210; CHECK-NEXT:    retq
6211  %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
6212  %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
6213  %res2 = add i8 %res, %res1
6214  ret i8 %res2
6215}
6216
; Check vpbroadcastd from a GPR: unmasked, merge-masked, and zero-masked
; forms; results summed so all three calls stay live.
6217define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
6218; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
6219; CHECK:       ## BB#0:
6220; CHECK-NEXT:    kmovw %esi, %k1
6221; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
6222; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z}
6223; CHECK-NEXT:    vpbroadcastd %edi, %zmm2
6224; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
6225; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
6226; CHECK-NEXT:    retq
6227  %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
6228  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
6229  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
6230  %res3 = add <16 x i32> %res, %res1
6231  %res4 = add <16 x i32> %res2, %res3
6232  ret <16 x i32> %res4
6233}
6234
6235declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
6236
; Check vpbroadcastq from a 64-bit GPR: unmasked, merge-masked, and
; zero-masked forms; results summed so all three calls stay live.
6237define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
6238; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
6239; CHECK:       ## BB#0:
6240; CHECK-NEXT:    kmovw %esi, %k1
6241; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
6242; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
6243; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2
6244; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
6245; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
6246; CHECK-NEXT:    retq
6247  %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
6248  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
6249  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
6250  %res3 = add <8 x i64> %res, %res1
6251  %res4 = add <8 x i64> %res2, %res3
6252  ret <8 x i64> %res4
6253}
6254declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
6255
; Check scalar vfmadd (sd) in four combinations: {unmasked, masked} x
; {round-to-nearest (arg 4), rz-sae (arg 3)}; results summed so all four
; calls stay live.
6256declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
6257
6258define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6259; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
6260; CHECK:       ## BB#0:
6261; CHECK-NEXT:    andl $1, %edi
6262; CHECK-NEXT:    kmovw %edi, %k1
6263; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6264; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
6265; CHECK-NEXT:    vmovaps %zmm1, %zmm4
6266; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
6267; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6268; CHECK-NEXT:    vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
6269; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
6270; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
6271; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
6272; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
6273; CHECK-NEXT:    retq
6274  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
6275  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
6276  %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
6277  %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
6278  %res4 = fadd <2 x double> %res, %res1
6279  %res5 = fadd <2 x double> %res2, %res3
6280  %res6 = fadd <2 x double> %res4, %res5
6281  ret <2 x double> %res6
6282}
6283
; Check scalar vfmadd (ss) in four combinations: {unmasked, masked} x
; {round-to-nearest (arg 4), rz-sae (arg 3)}; results summed so all four
; calls stay live.
6284declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
6285
6286define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6287; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
6288; CHECK:       ## BB#0:
6289; CHECK-NEXT:    andl $1, %edi
6290; CHECK-NEXT:    kmovw %edi, %k1
6291; CHECK-NEXT:    vmovaps %zmm0, %zmm3
6292; CHECK-NEXT:    vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
6293; CHECK-NEXT:    vmovaps %zmm1, %zmm4
6294; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
6295; CHECK-NEXT:    vmovaps %zmm0, %zmm5
6296; CHECK-NEXT:    vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
6297; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
6298; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
6299; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
6300; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
6301; CHECK-NEXT:    retq
6302  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
6303  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
6304  %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
6305  %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
6306  %res4 = fadd <4 x float> %res, %res1
6307  %res5 = fadd <4 x float> %res2, %res3
6308  %res6 = fadd <4 x float> %res4, %res5
6309  ret <4 x float> %res6
6310}
6311
; Check zero-masking scalar vfmadd (sd) with default rounding (arg 4) and
; rz-sae (arg 3); the two results are added together.
6312declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
6313
6314define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6315; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
6316; CHECK:       ## BB#0:
6317; CHECK-NEXT:    andl $1, %edi
6318; CHECK-NEXT:    kmovw %edi, %k1
6319; CHECK-NEXT:    vmovaps %zmm1, %zmm3
6320; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
6321; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
6322; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0
6323; CHECK-NEXT:    retq
6324  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
6325  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
6326  %res2 = fadd <2 x double> %res, %res1
6327  ret <2 x double> %res2
6328}
6329
; Check zero-masking scalar vfmadd (ss).
; NOTE(review): %res2 (= %res + %res1) is computed but the function
; returns %res, so the rz-sae call (%res1) is dead and only one
; vfmadd213ss appears in the checked output. This looks like a test typo
; (compare maskz_vfmadd_sd above, which returns the sum); if it is fixed
; to `ret <4 x float> %res2`, the CHECK lines must be regenerated with
; update_llc_test_checks.py.
6330declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
6331
6332define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6333; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
6334; CHECK:       ## BB#0:
6335; CHECK-NEXT:    andl $1, %edi
6336; CHECK-NEXT:    kmovw %edi, %k1
6337; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
6338; CHECK-NEXT:    vmovaps %zmm1, %zmm0
6339; CHECK-NEXT:    retq
6340  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
6341  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
6342  %res2 = fadd <4 x float> %res, %res1
6343  ret <4 x float> %res
6344}
; Check the mask3 scalar vfmadd (sd) variant (mask applied to operand 3,
; lowered as vfmadd231sd) in {unmasked, masked} x {nearest, rz-sae}.
6345declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
6346
6347define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6348; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
6349; CHECK:       ## BB#0:
6350; CHECK-NEXT:    andl $1, %edi
6351; CHECK-NEXT:    kmovw %edi, %k1
6352; CHECK-NEXT:    vmovaps %zmm2, %zmm3
6353; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
6354; CHECK-NEXT:    vmovaps %zmm1, %zmm4
6355; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
6356; CHECK-NEXT:    vmovaps %zmm2, %zmm5
6357; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
6358; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
6359; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
6360; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
6361; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
6362; CHECK-NEXT:    retq
6363  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
6364  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
6365  %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
6366  %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
6367  %res4 = fadd <2 x double> %res, %res1
6368  %res5 = fadd <2 x double> %res2, %res3
6369  %res6 = fadd <2 x double> %res4, %res5
6370  ret <2 x double> %res6
6371}
6372
; Check the mask3 scalar vfmadd (ss) variant (mask applied to operand 3,
; lowered as vfmadd231ss) in {unmasked, masked} x {nearest, rz-sae}.
6373declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
6374
6375define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6376; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
6377; CHECK:       ## BB#0:
6378; CHECK-NEXT:    andl $1, %edi
6379; CHECK-NEXT:    kmovw %edi, %k1
6380; CHECK-NEXT:    vmovaps %zmm2, %zmm3
6381; CHECK-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
6382; CHECK-NEXT:    vmovaps %zmm1, %zmm4
6383; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
6384; CHECK-NEXT:    vmovaps %zmm2, %zmm5
6385; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
6386; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
6387; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
6388; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
6389; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
6390; CHECK-NEXT:    retq
6391  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
6392  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
6393  %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
6394  %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
6395  %res4 = fadd <4 x float> %res, %res1
6396  %res5 = fadd <4 x float> %res2, %res3
6397  %res6 = fadd <4 x float> %res4, %res5
6398  ret <4 x float> %res6
6399}
6400
; Memory-operand variant: the scalar multiplicand is loaded from %ptr_b and
; should fold into the FMA as a memory operand ((%rdi) in the CHECK line)
; rather than being loaded separately.  mask3 form, so the result lands in
; the third source (%x1 here), hence the trailing vmovaps back to xmm0.
6401define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
6402; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
6403; CHECK:       ## BB#0:
6404; CHECK-NEXT:    andl $1, %esi
6405; CHECK-NEXT:    kmovw %esi, %k1
6406; CHECK-NEXT:    vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
6407; CHECK-NEXT:    vmovaps %zmm1, %zmm0
6408; CHECK-NEXT:    retq
  ; Build a vector whose element 0 is the loaded scalar; only lane 0 matters
  ; for the scalar intrinsic, so the remaining lanes stay undef.
6409  %q = load float, float* %ptr_b
6410  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6411  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
6412  ret < 4 x float> %res
6413}
6414
; Same memory-operand fold, but for the "mask" (merge-into-first-source)
; intrinsic: the result stays in %x0/xmm0, so no move is needed after the
; vfmadd132ss and the folded load appears as (%rdi).
6415define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6416; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
6417; CHECK:       ## BB#0:
6418; CHECK-NEXT:    andl $1, %esi
6419; CHECK-NEXT:    kmovw %esi, %k1
6420; CHECK-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
6421; CHECK-NEXT:    retq
  ; Scalar from memory goes into lane 0; other lanes undef (only lane 0 is
  ; used by the scalar intrinsic).
6422  %q = load float, float* %ptr_b
6423  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6424  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1,  i8 %x3, i32 4)
6425  ret < 4 x float> %res
6426}
6427
6428
; Zero-masking variant with a constant all-zero mask (i8 0): the mask
; register is materialized with kxorw %k0,%k0,%k1 and the FMA uses
; {%k1} {z}, so every lane of the destination is zeroed.  Also checks the
; memory operand still folds into the instruction ((%rdi)).
6429define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6430; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
6431; CHECK:       ## BB#0:
6432; CHECK-NEXT:    kxorw %k0, %k0, %k1
6433; CHECK-NEXT:    vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
6434; CHECK-NEXT:    vmovaps %zmm1, %zmm0
6435; CHECK-NEXT:    retq
  ; Loaded scalar becomes lane 0 of the third operand; %x3 is deliberately
  ; unused -- the call passes a literal i8 0 mask.
6436  %q = load float, float* %ptr_b
6437  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6438  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
6439  ret < 4 x float> %res
6440}
6441