; RUN: llc < %s -mattr=sse2 -mtriple=i386-unknown-linux-gnu | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a,int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way as to avoid the extra
; multiply.
;
; Output looks roughly like this (register names and operand order are
; illustrative only; the FileCheck directives below are what is enforced):
;
;     movl    8(%esp), %eax
;     movl    12(%esp), %ecx
;     imull   $400, %ecx, %edx        # imm = 0x190
;     leal    (%edx,%eax), %esi
;     movl    $11, 2020(%esi,%ecx,4)
;     movl    $22, 2080(%edx,%eax)
;     movl    $33, 10080(%edx,%eax)
;
; The label pattern ends in ':' so that it matches only the function's own
; label line, not the longer *_splat / *_non_splat names that share its prefix.
; A single imull by 400 (one row = 100 x i32 = 400 bytes) must feed all three
; stores.
;
; CHECK-LABEL: testCombineMultiplies:
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
; CHECK-NEXT: leal ([[ARG2:%[a-z]+]],[[MUL]]), [[LEA:%[a-z]+]]
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
; CHECK-NEXT: movl $22, {{[0-9]+}}([[ARG2]],[[MUL]])
; CHECK-NEXT: movl $33, {{[0-9]+}}([[ARG2]],[[MUL]])
; CHECK: retl
;

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) {
entry:
  ; LOC = lll + 5
  %add = add nsw i32 %lll, 5
  ; a[LOC][LOC] = 11
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  ; a[LOC][20] = 22
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  ; a[LOC+20][20] = 33, with LOC + 20 already folded to lll + 25
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:            # @testCombineMultiplies_splat
; # BB#0:                                 # %entry
;     movdqa  .LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
;     paddd   %xmm0, %xmm1
;     movdqa  .LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
;     pshufd  $245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
;     pmuludq %xmm2, %xmm0
;     pshufd  $232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
;     pmuludq %xmm2, %xmm3
;     pshufd  $232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
;     punpckldq %xmm2, %xmm0          # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
;     movdqa  .LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;     paddd   %xmm0, %xmm2
;     paddd   .LCPI1_3, %xmm0
;     movdqa  %xmm2, v2
;     movdqa  %xmm0, v3
;     movdqa  %xmm1, x
;     retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
;
; Capture names: C11 and C22 hold the splat constants 11 and 22; C242 and C726
; hold the folded addends 11*22 and 33*22. Every store line names its mnemonic
; so that only a real movdqa of the expected register to the expected global
; is accepted. The label pattern ends in ':' so it matches only the label line.
;
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK: movdqa .LCPI1_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI1_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T4:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T4]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI1_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI1_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x
; CHECK-NEXT: retl

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) {
entry:
  ; v2 = (v1 + 11) * 22
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  ; v3 = (v1 + 33) * 22
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  ; %add1 has a second use besides the multiply: it is also stored to @x.
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.
;
; The capture names (C11, C22, C242, C726) are only labels; they do not
; describe the per-lane constants used by this function. Every store line
; names its mnemonic so that only a movdqa of the expected register to the
; expected global is accepted. The label pattern ends in ':' so it matches
; only the function's label line.
;
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK: movdqa .LCPI2_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI2_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, [[C22]], [[T7:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[T1]], [[T7]]
; CHECK-NEXT: pshufd $232, [[T7]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI2_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI2_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x
; CHECK-NEXT: retl

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
entry:
  ; v2 = (v1 + <11,22,33,44>) * <22,33,44,55>
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  ; v3 = (v1 + <33,44,55,66>) * <22,33,44,55>, the same multiplier as above
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  ; %add1 has a second use besides the multiply: it is also stored to @x.
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}