; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; sdiv by 7
;

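; A short sketch of what the checks below encode (assuming the usual
; magic-number lowering; the constant and shifts come straight from the
; CHECK lines): there is no 256-bit vector multiply for i64, so each lane is
; scalarized. 5270498306774157605 = 0x4924924924924925 is ceil(2^65 / 7), and
; the quotient is taken from the high half of the 128-bit signed product:
;   q = ((a * M) >> 65) + (sign bit of the high half)
; which is the shrq $63 / sarq / addq sequence. E.g. for a = 100 the high
; 64 bits of a * M are 28, and (28 >> 1) + 0 = 14 = 100 / 7.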
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

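; For i32 lanes the magic constant is 2454267027 = 0x92492493, which is
; negative as an i32, so the lowering adds the dividend back after taking the
; high half of the signed 32x32 multiply (vpmuldq on even/odd lanes, then a
; blend), and finishes with an arithmetic shift by 2 plus a sign correction:
;   q = (s >> 2) + (s >>u 31), where s = ((a * M) >> 32) + a
; E.g. a = 100: high half is -43, s = 57, 57 >> 2 = 14 = 100 / 7.
; (A sketch inferred from the CHECK lines, not an extra assertion.)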
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT:    vpsrad $2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

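; For i16 lanes vpmulhw already returns the high half of the signed product,
; so the whole division is vectorized directly: with M = 18725 = 0x4925,
;   q = (((a * M) >> 16) >> 1) + sign bit
; (vpsraw $1 plus vpsrlw $15). E.g. a = 100: (100 * 18725) >> 16 = 28,
; (28 >> 1) + 0 = 14. AVX2 does this in a single 256-bit multiply; AVX1 has
; to split into two 128-bit halves. (Sketch only, derived from the checks.)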
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $15, %xmm1, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm1
; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

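; i8 has neither a byte multiply nor a byte arithmetic shift, so the checks
; below sign-extend to i16, multiply by the byte magic 147 (= -109 as i8,
; sign-extended by vpmovsxbw), take the high byte, repack, and add the
; dividend back (the magic is negative). The arithmetic shift right by 2 is
; emulated with a logical shift: (x >>u 2) & 63, then xor 32 / sub 32 to
; re-sign-extend from bit 5, plus the sign bit from vpsrlw $7 & 1.
; E.g. a = 100: high byte of 100 * -109 is -43, -43 + 100 = 57, 57 >> 2 = 14.
; (A sketch of the idiom, inferred from the CHECK lines.)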
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
; AVX1-NEXT:    vpmullw %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; srem by 7
;

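; The remainder tests reuse the division sequences above and then compute
; r = a - 7 * q. For the scalar i64 path 7 * q is formed without a second
; multiply: leaq (,%rdx,8), %rax gives 8 * q and subq %rdx, %rax turns it
; into 7 * q, which is finally subtracted from the original lane value.
; E.g. a = 100, q = 14: 8*14 - 14 = 98, 100 - 98 = 2 = 100 rem 7.
; (Sketch of the pattern the checks encode.)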
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm4
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT:    vpsrad $2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $15, %xmm3, %xmm4
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

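; For the i8 remainder the 7 * q step also has to go through i16: q is
; sign-extended with vpmovsxbw, multiplied by a splat of 7 with vpmullw, and
; the low bytes are repacked (pand 255 + vpackuswb on AVX1, vpshufb +
; vpunpcklqdq on AVX2) before the final vpsubb from the original value.
; (Descriptive sketch only; the authoritative sequence is in the checks.)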
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $7, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm4
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm5
; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm9, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm3
; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsubb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm3
; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}