; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; sdiv by 7
;
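; All of the sdiv-by-7 cases below are expected to lower to a signed
; magic-number multiply (multiply by a precomputed constant, keep the high
; half of the product, shift, then add the sign bit) rather than a real
; division: i64 lanes use 0x4924924924924925, i32 lanes 0x92492493
; (2454267027), i16 lanes 0x4925 (18725), and i8 lanes 0x93 (147).
; A rough scalar C model of the i32 recipe, for reference only (names and
; casts are illustrative, not taken from this test's source):
;   int32_t div7(int32_t n) {
;     int32_t q = (int32_t)(((int64_t)n * (int32_t)0x92492493) >> 32); // high half of n*magic
;     q += n;                                 // the magic constant is negative, so add n back
;     return (q >> 2) + ((uint32_t)q >> 31);  // shift, then add the sign bit
;   }
; There is no vector multiply-high for i64 on AVX/AVX2, so the <4 x i64>
; case is expected to be scalarized through the GPRs (one imulq per element).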
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}
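; The <32 x i8> case has no byte multiply or byte shifts to work with, so the
; expected sequence sign-extends each half to i16, multiplies by 147 (0x93,
; the signed magic for 7), takes the high byte of each product, and adds the
; input back. The arithmetic shift right by 2 of the bytes is then emulated
; with a logical word shift plus mask, sign-corrected via the xor-32/sub-32
; trick, with the sign bit recovered separately by a shift-by-7 and mask.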
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7
; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; srem by 7
;
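; The srem cases are expected to reuse the sdiv-by-7 quotient sequence above
; and then compute the remainder as n - 7*q. For the scalarized i64 lanes,
; 7*q is formed without a second multiply as (q*8) - q via leaq (,%rdx,8)
; followed by a subq.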
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}
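; For the i32 and i16 lanes the back-multiply by 7 is expected to be a plain
; vpmulld / vpmullw against a splat of 7, followed by a vector subtract.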
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}
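; For the byte remainder there is no byte multiply, so 7*q is expected to be
; computed in widened i16 lanes; the AVX1 path masks the low byte of each
; product and repacks with vpackuswb, while the AVX2 path gathers the even
; bytes with vpshufb before the final vpsubb.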
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm4
; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}