; Review overview for the header and @sad_16i8 (comment-only addition; all text below is unchanged).
; The RUN lines pipe this file through llc for x86_64 with +sse2, +avx2, +avx512f and +avx512bw, and check
; the output with FileCheck under the prefixes SSE2, AVX2, AVX512F and AVX512BW respectively.
; @a and @b are zero-initialized [1024 x i8] globals aligned to 16.
; @sad_16i8: %vector.body loads <16 x i8> from @a and @b at %index, zero-extends both to <16 x i32>,
; subtracts them, selects between the difference and its negation (icmp sgt against -1) to form the absolute
; difference, and adds it to the accumulator %vec.phi. %middle.block folds the accumulator with a
; shufflevector/add ladder (offsets 8, 4, 2, 1) and returns element 0.
; The CHECK lines for every prefix expect the loop body to use psadbw/vpsadbw.
; NOTE(review): %index.next advances by 4 while each load covers 16 bytes, so consecutive loads overlap and
; the load at index 1020 extends past the 1024-byte array - presumably deliberate test input; confirm before
; touching it, because the CHECK lines are autogenerated by update_llc_test_checks.py and would need regenerating.
; NOTE(review): the "<n>*9880d681SAndroid Build Coastguard Worker" prefix on every logical line looks like
; code-browser gutter residue rather than test content - TODO confirm against the upstream file.
; The last physical line of this span also holds the opening CHECK lines of @sad_32i8, reproduced untouched.
1*9880d681SAndroid Build Coastguard Worker; NOTE: Assertions have been autogenerated by update_llc_test_checks.py 2*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 3*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 4*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F 5*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW 6*9880d681SAndroid Build Coastguard Worker 7*9880d681SAndroid Build Coastguard Worker@a = global [1024 x i8] zeroinitializer, align 16 8*9880d681SAndroid Build Coastguard Worker@b = global [1024 x i8] zeroinitializer, align 16 9*9880d681SAndroid Build Coastguard Worker 10*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_16i8() nounwind { 11*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_16i8: 12*9880d681SAndroid Build Coastguard Worker; SSE2: # BB#0: # %entry 13*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 14*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 15*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm1, %xmm1 16*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .p2align 4, 0x90 17*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .LBB0_1: # %vector.body 18*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 19*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqu a+1024(%rax), %xmm2 20*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqu b+1024(%rax), %xmm3 21*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psadbw %xmm2, %xmm3 22*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm3, %xmm1 23*9880d681SAndroid Build Coastguard 
Worker; SSE2-NEXT: addq $4, %rax 24*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: jne .LBB0_1 25*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # BB#2: # %middle.block 26*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm1 27*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm0 28*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm0 29*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 30*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm1 31*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 32*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm0 33*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd %xmm0, %eax 34*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: retq 35*9880d681SAndroid Build Coastguard Worker; 36*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_16i8: 37*9880d681SAndroid Build Coastguard Worker; AVX2: # BB#0: # %entry 38*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 39*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 40*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 41*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .p2align 4, 0x90 42*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .LBB0_1: # %vector.body 43*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 44*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2 45*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 46*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2 47*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 48*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: addq $4, %rax 
49*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: jne .LBB0_1 50*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # BB#2: # %middle.block 51*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 52*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 53*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 54*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 55*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 56*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 57*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd %xmm0, %eax 58*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vzeroupper 59*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: retq 60*9880d681SAndroid Build Coastguard Worker; 61*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_16i8: 62*9880d681SAndroid Build Coastguard Worker; AVX512F: # BB#0: # %entry 63*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0 64*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 65*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .p2align 4, 0x90 66*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .LBB0_1: # %vector.body 67*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 68*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1 69*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 70*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1 71*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 72*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: addq $4, %rax 73*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: jne .LBB0_1 
74*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # BB#2: # %middle.block 75*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 76*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 77*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 78*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 79*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 80*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 81*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 82*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 83*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd %xmm0, %eax 84*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: retq 85*9880d681SAndroid Build Coastguard Worker; 86*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_16i8: 87*9880d681SAndroid Build Coastguard Worker; AVX512BW: # BB#0: # %entry 88*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0 89*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 90*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .p2align 4, 0x90 91*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .LBB0_1: # %vector.body 92*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 93*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1 94*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 95*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1 96*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: 
vinserti32x4 $0, %xmm1, %zmm0, %zmm0 97*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: addq $4, %rax 98*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: jne .LBB0_1 99*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # BB#2: # %middle.block 100*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 101*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 102*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 103*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 104*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 105*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 106*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 107*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 108*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd %xmm0, %eax 109*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: retq 110*9880d681SAndroid Build Coastguard Workerentry: 111*9880d681SAndroid Build Coastguard Worker br label %vector.body 112*9880d681SAndroid Build Coastguard Worker 113*9880d681SAndroid Build Coastguard Workervector.body: 114*9880d681SAndroid Build Coastguard Worker %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 115*9880d681SAndroid Build Coastguard Worker %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 116*9880d681SAndroid Build Coastguard Worker %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 117*9880d681SAndroid Build Coastguard Worker %1 = bitcast i8* %0 to <16 x i8>* 118*9880d681SAndroid Build Coastguard Worker %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 
119*9880d681SAndroid Build Coastguard Worker %2 = zext <16 x i8> %wide.load to <16 x i32> 120*9880d681SAndroid Build Coastguard Worker %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 121*9880d681SAndroid Build Coastguard Worker %4 = bitcast i8* %3 to <16 x i8>* 122*9880d681SAndroid Build Coastguard Worker %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 123*9880d681SAndroid Build Coastguard Worker %5 = zext <16 x i8> %wide.load1 to <16 x i32> 124*9880d681SAndroid Build Coastguard Worker %6 = sub nsw <16 x i32> %2, %5 125*9880d681SAndroid Build Coastguard Worker %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 126*9880d681SAndroid Build Coastguard Worker %8 = sub nsw <16 x i32> zeroinitializer, %6 127*9880d681SAndroid Build Coastguard Worker %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 128*9880d681SAndroid Build Coastguard Worker %10 = add nsw <16 x i32> %9, %vec.phi 129*9880d681SAndroid Build Coastguard Worker %index.next = add i64 %index, 4 130*9880d681SAndroid Build Coastguard Worker %11 = icmp eq i64 %index.next, 1024 131*9880d681SAndroid Build Coastguard Worker br i1 %11, label %middle.block, label %vector.body 132*9880d681SAndroid Build Coastguard Worker 133*9880d681SAndroid Build Coastguard Workermiddle.block: 134*9880d681SAndroid Build Coastguard Worker %.lcssa = phi <16 x i32> [ %10, %vector.body ] 135*9880d681SAndroid Build Coastguard Worker %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 136*9880d681SAndroid Build Coastguard Worker %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf 137*9880d681SAndroid Build Coastguard Worker %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 138*9880d681SAndroid Build Coastguard Worker %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 139*9880d681SAndroid Build Coastguard Worker %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 140*9880d681SAndroid Build Coastguard Worker %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 141*9880d681SAndroid Build Coastguard Worker %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 142*9880d681SAndroid Build Coastguard Worker %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 143*9880d681SAndroid Build Coastguard Worker %12 = extractelement <16 x i32> %bin.rdx4, i32 0 144*9880d681SAndroid Build Coastguard Worker ret i32 %12 145*9880d681SAndroid Build Coastguard Worker} 146*9880d681SAndroid Build Coastguard Worker 147*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_32i8() nounwind { 148*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_32i8: 149*9880d681SAndroid Build Coastguard Worker; SSE2: # BB#0: # %entry 150*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm12, %xmm12 151*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 152*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm4 153*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm2, %xmm2 154*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 155*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm1, %xmm1 156*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm13, %xmm13 
157*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm15, %xmm15 158*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm5, %xmm5 159*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm14, %xmm14 160*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .p2align 4, 0x90 161*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .LBB1_1: # %vector.body 162*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 163*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill 164*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 165*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 166*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill 167*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa a+1040(%rax), %xmm0 168*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa a+1024(%rax), %xmm1 169*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1] 170*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 171*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] 172*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 173*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] 174*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, %xmm6 
175*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] 176*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 177*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, %xmm2 178*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] 179*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 180*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1040(%rax), %xmm3 181*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1024(%rax), %xmm5 182*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,0,1] 183*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 184*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm3, %xmm10 185*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 186*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm3, %xmm0 187*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1] 188*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] 189*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] 190*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm10, %xmm2 191*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, %xmm3 192*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] 193*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm5, %xmm1 194*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, %xmm5 195*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 196*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] 197*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 198*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] 199*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm3, %xmm6 200*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, %xmm10 201*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm9, %xmm4 202*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 203*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm9, %xmm7 204*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm8, %xmm3 205*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 206*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] 207*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 208*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] 209*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm5 210*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm11, %xmm4 211*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 212*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm11, %xmm8 213*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] 214*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] 215*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm3 216*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm3, %xmm4 217*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 218*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm3 219*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm3 220*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm8, %xmm4 221*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 222*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm8 223*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm8 224*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, %xmm4 225*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 226*9880d681SAndroid Build 
Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm5 227*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm5 228*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, %xmm4 229*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 230*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm7 231*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm7 232*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm6, %xmm4 233*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 234*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm6 235*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm6 236*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, %xmm4 237*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 238*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm1 239*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm1 240*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm4 241*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 242*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm2 243*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm2 244*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, %xmm4 245*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 246*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm0 247*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm0 248*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm10, %xmm4 249*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm15 250*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 251*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm2, %xmm13 252*9880d681SAndroid Build 
Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 253*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm2 254*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload 255*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm6, %xmm4 256*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm7, %xmm14 257*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload 258*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm5, %xmm6 259*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill 260*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload 261*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm8, %xmm1 262*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm3, %xmm0 263*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: addq $4, %rax 264*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: jne .LBB1_1 265*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # BB#2: # %middle.block 266*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm15, %xmm2 267*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm14, %xmm1 268*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm13, %xmm4 269*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm5, %xmm0 270*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm0 271*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm2, %xmm1 272*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm1 273*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 274*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm0 275*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 276*9880d681SAndroid 
Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm1 277*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd %xmm1, %eax 278*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: retq 279*9880d681SAndroid Build Coastguard Worker; 280*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_32i8: 281*9880d681SAndroid Build Coastguard Worker; AVX2: # BB#0: # %entry 282*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 283*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 284*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 285*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .p2align 4, 0x90 286*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .LBB1_1: # %vector.body 287*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 288*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2 289*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 290*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 291*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: addq $4, %rax 292*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: jne .LBB1_1 293*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # BB#2: # %middle.block 294*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 295*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 296*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 297*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 298*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 299*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 300*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 301*9880d681SAndroid Build Coastguard 
Worker; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 302*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd %xmm0, %eax 303*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vzeroupper 304*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: retq 305*9880d681SAndroid Build Coastguard Worker; 306*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_32i8: 307*9880d681SAndroid Build Coastguard Worker; AVX512F: # BB#0: # %entry 308*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0 309*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 310*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 311*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .p2align 4, 0x90 312*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .LBB1_1: # %vector.body 313*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 314*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2 315*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 316*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2 317*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 318*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: addq $4, %rax 319*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: jne .LBB1_1 320*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # BB#2: # %middle.block 321*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 322*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 323*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 324*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 325*9880d681SAndroid Build Coastguard Worker; 
AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 326*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 327*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 328*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 329*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 330*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd %xmm0, %eax 331*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: retq 332*9880d681SAndroid Build Coastguard Worker; 333*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_32i8: 334*9880d681SAndroid Build Coastguard Worker; AVX512BW: # BB#0: # %entry 335*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0 336*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 337*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 338*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .p2align 4, 0x90 339*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .LBB1_1: # %vector.body 340*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 341*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2 342*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 343*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2 344*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 345*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: addq $4, %rax 346*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: jne .LBB1_1 347*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # BB#2: # %middle.block 348*9880d681SAndroid Build Coastguard Worker; 
AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 349*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 350*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 351*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 352*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 353*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 354*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 355*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 356*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 357*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd %xmm0, %eax 358*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: retq 359*9880d681SAndroid Build Coastguard Workerentry: 360*9880d681SAndroid Build Coastguard Worker br label %vector.body 361*9880d681SAndroid Build Coastguard Worker 362*9880d681SAndroid Build Coastguard Workervector.body: 363*9880d681SAndroid Build Coastguard Worker %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 364*9880d681SAndroid Build Coastguard Worker %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 365*9880d681SAndroid Build Coastguard Worker %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 366*9880d681SAndroid Build Coastguard Worker %1 = bitcast i8* %0 to <32 x i8>* 367*9880d681SAndroid Build Coastguard Worker %wide.load = load <32 x i8>, <32 x i8>* %1, align 32 368*9880d681SAndroid Build Coastguard Worker %2 = zext <32 x i8> %wide.load to <32 x i32> 369*9880d681SAndroid Build Coastguard Worker %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 370*9880d681SAndroid 
Build Coastguard Worker %4 = bitcast i8* %3 to <32 x i8>* 371*9880d681SAndroid Build Coastguard Worker %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32 372*9880d681SAndroid Build Coastguard Worker %5 = zext <32 x i8> %wide.load1 to <32 x i32> 373*9880d681SAndroid Build Coastguard Worker %6 = sub nsw <32 x i32> %2, %5 374*9880d681SAndroid Build Coastguard Worker %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 375*9880d681SAndroid Build Coastguard Worker %8 = sub nsw <32 x i32> zeroinitializer, %6 376*9880d681SAndroid Build Coastguard Worker %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8 377*9880d681SAndroid Build Coastguard Worker %10 = add nsw <32 x i32> %9, %vec.phi 378*9880d681SAndroid Build Coastguard Worker %index.next = add i64 %index, 4 379*9880d681SAndroid Build Coastguard Worker %11 = icmp eq i64 %index.next, 1024 380*9880d681SAndroid Build Coastguard Worker br i1 %11, label %middle.block, label %vector.body 381*9880d681SAndroid Build Coastguard Worker 382*9880d681SAndroid Build Coastguard Workermiddle.block: 383*9880d681SAndroid Build Coastguard Worker %.lcssa = phi <32 x i32> [ %10, %vector.body ] 384*9880d681SAndroid Build Coastguard Worker %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 385*9880d681SAndroid Build Coastguard Worker %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf 386*9880d681SAndroid Build Coastguard Worker %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> 
undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 387*9880d681SAndroid Build Coastguard Worker %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2 388*9880d681SAndroid Build Coastguard Worker %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 389*9880d681SAndroid Build Coastguard Worker %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3 390*9880d681SAndroid Build Coastguard Worker %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 391*9880d681SAndroid Build Coastguard Worker %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4 392*9880d681SAndroid Build Coastguard Worker %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef> 393*9880d681SAndroid Build Coastguard Worker %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5 394*9880d681SAndroid Build Coastguard Worker %12 = extractelement <32 x i32> %bin.rdx5, i32 0 395*9880d681SAndroid Build Coastguard Worker ret i32 %12 396*9880d681SAndroid Build Coastguard Worker} 397*9880d681SAndroid Build Coastguard Worker 398*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_avx64i8() nounwind { 399*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_avx64i8: 400*9880d681SAndroid Build Coastguard Worker; SSE2: # BB#0: # %entry 401*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: subq $232, %rsp 402*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm8, %xmm8 403*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 404*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm5, %xmm5 405*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm2, %xmm2 406*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm1, %xmm1 407*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm3, %xmm3 408*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm6, %xmm6 409*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 410*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 411*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm13, %xmm13 412*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm10, %xmm10 413*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 414*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 415*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm12, %xmm12 416*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 417*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 418*9880d681SAndroid Build 
Coastguard Worker; SSE2-NEXT: pxor %xmm11, %xmm11 419*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm15, %xmm15 420*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm9, %xmm9 421*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm7, %xmm7 422*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 423*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .p2align 4, 0x90 424*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .LBB2_1: # %vector.body 425*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 426*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill 427*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill 428*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill 429*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 430*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill 431*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill 432*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill 433*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill 434*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill 435*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill 436*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill 437*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill 438*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill 439*9880d681SAndroid Build Coastguard Worker; 
SSE2-NEXT: movdqa a+1040(%rax), %xmm13 440*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa a+1024(%rax), %xmm1 441*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa a+1056(%rax), %xmm3 442*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa a+1072(%rax), %xmm6 443*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] 444*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 445*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 446*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm3, %xmm12 447*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 448*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 449*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,0,1] 450*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 451*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill 452*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 453*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] 454*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 455*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, %xmm0 456*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 457*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, %xmm15 458*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 459*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] 460*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm13, %xmm0 461*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 462*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] 463*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1040(%rax), %xmm7 464*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 465*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1056(%rax), %xmm9 466*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1] 467*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 468*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, %xmm4 469*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 470*9880d681SAndroid Build Coastguard 
Worker; SSE2-NEXT: psubd %xmm7, %xmm13 471*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1] 472*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 473*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] 474*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm0 475*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 476*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm11, %xmm4 477*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 478*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm11, %xmm1 479*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1] 480*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 481*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] 482*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm15 483*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill 484*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm9, %xmm4 485*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 486*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 487*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm9, %xmm3 488*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, %xmm0 489*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 490*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm5, %xmm10 491*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm15 492*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 493*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload 494*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] 495*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 496*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 497*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm0, %xmm5 498*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill 499*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, %xmm0 500*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 501*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm7, %xmm2 502*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 503*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1] 504*9880d681SAndroid Build Coastguard 
Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] 505*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 506*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 507*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] 508*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 509*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm0, %xmm15 510*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm11 511*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] 512*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 513*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] 514*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 515*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm12 516*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm14, %xmm0 517*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3] 518*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm14, %xmm2 
519*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm14 520*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm6, %xmm9 521*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] 522*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 523*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 524*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm0, %xmm11 525*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa b+1072(%rax), %xmm0 526*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] 527*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 528*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, %xmm5 529*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 530*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm0, %xmm6 531*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 532*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 533*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm5, %xmm9 534*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 535*9880d681SAndroid Build Coastguard Worker; 
SSE2-NEXT: movdqa %xmm7, %xmm0 536*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 537*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 538*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, %xmm5 539*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] 540*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm4, %xmm7 541*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 542*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 543*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psubd %xmm5, %xmm0 544*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm0, %xmm4 545*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 546*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm0 547*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm0 548*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm7, %xmm4 549*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 550*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm7 551*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm7 552*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm9, %xmm4 553*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 554*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm9 555*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm9 556*9880d681SAndroid Build 
Coastguard Worker; SSE2-NEXT: movdqa %xmm6, %xmm4 557*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 558*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm6 559*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm6 560*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill 561*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm11, %xmm4 562*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 563*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm11 564*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm11 565*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm14, %xmm4 566*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 567*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm14 568*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm14 569*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm12, %xmm4 570*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 571*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm12 572*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm12 573*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill 574*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm15, %xmm4 575*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 576*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm15 577*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm15 578*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 579*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm4 580*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 581*9880d681SAndroid Build Coastguard 
Worker; SSE2-NEXT: paddd %xmm4, %xmm2 582*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm2 583*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 584*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 585*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm4 586*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 587*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm2 588*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm2 589*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 590*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm10, %xmm4 591*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 592*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm10 593*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm10 594*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm3, %xmm4 595*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 596*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm3 597*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm3 598*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 599*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm4 600*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 601*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm2 602*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm2 603*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 604*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm1, %xmm4 605*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, 
%xmm4 606*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm1 607*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm1 608*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 609*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm4 610*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 611*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm2 612*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm2 613*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, %xmm5 614*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm13, %xmm4 615*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psrad $31, %xmm4 616*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm4, %xmm13 617*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm4, %xmm13 618*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 619*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm13, %xmm2 620*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill 621*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload 622*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload 623*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm5, %xmm6 624*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 625*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm4 626*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill 627*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 628*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 
# 16-byte Reload 629*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload 630*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 631*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm3, %xmm4 632*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill 633*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload 634*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 635*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm10, %xmm4 636*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill 637*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload 638*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload 639*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload 640*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload 641*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload 642*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm15, %xmm1 643*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload 644*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload 645*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 646*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm14, %xmm4 647*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill 648*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa 
{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 649*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm11, %xmm4 650*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill 651*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload 652*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload 653*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 654*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm9, %xmm4 655*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill 656*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload 657*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 658*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm7, %xmm4 659*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill 660*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload 661*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload 662*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm4 663*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill 664*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 665*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: addq $4, %rax 666*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: jne .LBB2_1 667*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # BB#2: # %middle.block 668*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload 669*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm7, %xmm13 
670*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload 671*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm15, %xmm6 672*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm11, %xmm3 673*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm10 674*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm12, %xmm2 675*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 676*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm9, %xmm0 677*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm2, %xmm0 678*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm3, %xmm10 679*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm5, %xmm6 680*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm1, %xmm13 681*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm6, %xmm13 682*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm10 683*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm13, %xmm10 684*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1] 685*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm10, %xmm0 686*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 687*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddd %xmm0, %xmm1 688*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd %xmm1, %eax 689*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: addq $232, %rsp 690*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: retq 691*9880d681SAndroid Build Coastguard Worker; 692*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_avx64i8: 693*9880d681SAndroid Build Coastguard Worker; AVX2: # BB#0: # %entry 694*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 695*9880d681SAndroid Build Coastguard 
Worker; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 696*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 697*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 698*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 699*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 700*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 701*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 702*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 703*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .p2align 4, 0x90 704*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .LBB2_1: # %vector.body 705*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 706*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 707*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill 708*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 709*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 710*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 711*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 712*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 713*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 714*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 715*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 716*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 717*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 718*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 719*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 720*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 721*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 722*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 723*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 724*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 725*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 726*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 727*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 728*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 729*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill 730*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 731*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload 732*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 733*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm8, %ymm8 734*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 735*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm14, %ymm8 736*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 737*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm13, %ymm8 738*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 739*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm12, %ymm8 740*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 741*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm11, %ymm8 742*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 743*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm10, %ymm8 744*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 745*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload 746*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 747*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpabsd %ymm15, %ymm8 748*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 749*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: addq $4, %rax 750*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: jne .LBB2_1 751*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # BB#2: # %middle.block 752*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm6, %ymm2, 
%ymm2 753*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 754*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 755*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 756*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 757*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 758*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 759*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 760*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 761*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 762*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 763*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 764*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd %xmm0, %eax 765*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vzeroupper 766*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: retq 767*9880d681SAndroid Build Coastguard Worker; 768*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_avx64i8: 769*9880d681SAndroid Build Coastguard Worker; AVX512F: # BB#0: # %entry 770*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0 771*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 772*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 773*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 774*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxord %zmm3, %zmm3, %zmm3 775*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .p2align 4, 0x90 776*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .LBB2_1: # %vector.body 777*9880d681SAndroid Build Coastguard Worker; 
AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 778*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 779*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 780*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 781*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 782*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 783*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 784*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 785*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 786*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 787*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 788*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 789*9880d681SAndroid Build Coastguard 
Worker; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 790*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpabsd %zmm4, %zmm4 791*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpabsd %zmm5, %zmm5 792*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpabsd %zmm6, %zmm6 793*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpabsd %zmm7, %zmm7 794*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 795*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 796*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 797*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 798*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: addq $4, %rax 799*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: jne .LBB2_1 800*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # BB#2: # %middle.block 801*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 802*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 803*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 804*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 805*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 806*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 807*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 808*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 809*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 810*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 811*9880d681SAndroid Build Coastguard Worker; 
AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 812*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd %xmm0, %eax 813*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: retq 814*9880d681SAndroid Build Coastguard Worker; 815*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_avx64i8: 816*9880d681SAndroid Build Coastguard Worker; AVX512BW: # BB#0: # %entry 817*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0 818*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 819*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 820*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .p2align 4, 0x90 821*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .LBB2_1: # %vector.body 822*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 823*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2 824*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2 825*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 826*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: addq $4, %rax 827*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: jne .LBB2_1 828*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # BB#2: # %middle.block 829*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1 830*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 831*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 832*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] 833*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 834*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] 
835*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 836*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] 837*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 838*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] 839*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 840*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd %xmm0, %eax 841*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: retq 842*9880d681SAndroid Build Coastguard Workerentry: 843*9880d681SAndroid Build Coastguard Worker br label %vector.body 844*9880d681SAndroid Build Coastguard Worker 845*9880d681SAndroid Build Coastguard Workervector.body: 846*9880d681SAndroid Build Coastguard Worker %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 847*9880d681SAndroid Build Coastguard Worker %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 848*9880d681SAndroid Build Coastguard Worker %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 849*9880d681SAndroid Build Coastguard Worker %1 = bitcast i8* %0 to <64 x i8>* 850*9880d681SAndroid Build Coastguard Worker %wide.load = load <64 x i8>, <64 x i8>* %1, align 64 851*9880d681SAndroid Build Coastguard Worker %2 = zext <64 x i8> %wide.load to <64 x i32> 852*9880d681SAndroid Build Coastguard Worker %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 853*9880d681SAndroid Build Coastguard Worker %4 = bitcast i8* %3 to <64 x i8>* 854*9880d681SAndroid Build Coastguard Worker %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64 855*9880d681SAndroid Build Coastguard Worker %5 = zext <64 x i8> %wide.load1 to <64 x i32> 856*9880d681SAndroid Build Coastguard Worker %6 = sub nsw <64 x i32> %2, %5 857*9880d681SAndroid Build 
Coastguard Worker %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 858*9880d681SAndroid Build Coastguard Worker %8 = sub nsw <64 x i32> zeroinitializer, %6 859*9880d681SAndroid Build Coastguard Worker %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8 860*9880d681SAndroid Build Coastguard Worker %10 = add nsw <64 x i32> %9, %vec.phi 861*9880d681SAndroid Build Coastguard Worker %index.next = add i64 %index, 4 862*9880d681SAndroid Build Coastguard Worker %11 = icmp eq i64 %index.next, 1024 863*9880d681SAndroid Build Coastguard Worker br i1 %11, label %middle.block, label %vector.body 864*9880d681SAndroid Build Coastguard Worker 865*9880d681SAndroid Build Coastguard Workermiddle.block: 866*9880d681SAndroid Build Coastguard Worker %.lcssa = phi <64 x i32> [ %10, %vector.body ] 867*9880d681SAndroid Build Coastguard Worker %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef> 868*9880d681SAndroid Build Coastguard Worker %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf 869*9880d681SAndroid Build Coastguard Worker %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 870*9880d681SAndroid Build Coastguard Worker %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2 871*9880d681SAndroid Build Coastguard Worker %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 872*9880d681SAndroid Build Coastguard Worker %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3 873*9880d681SAndroid Build Coastguard Worker %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, 
i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 874*9880d681SAndroid Build Coastguard Worker %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4 875*9880d681SAndroid Build Coastguard Worker %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 876*9880d681SAndroid Build Coastguard Worker %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5 877*9880d681SAndroid Build Coastguard Worker %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 878*9880d681SAndroid Build Coastguard Worker %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6 879*9880d681SAndroid Build Coastguard Worker %12 = extractelement <64 x i32> %bin.rdx6, i32 0 880*9880d681SAndroid Build Coastguard Worker ret i32 %12 881*9880d681SAndroid Build Coastguard Worker} 882*9880d681SAndroid Build Coastguard Worker 883*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_2i8() nounwind { 884*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_2i8: 885*9880d681SAndroid Build Coastguard Worker; SSE2: # BB#0: # %entry 886*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pxor %xmm0, %xmm0 887*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 888*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF 889*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd %ecx, %xmm1 890*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .p2align 4, 0x90 891*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: .LBB3_1: # %vector.body 892*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 893*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 894*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 895*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pand %xmm1, %xmm3 896*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pand %xmm1, %xmm2 
897*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: psadbw %xmm3, %xmm2 898*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddq %xmm2, %xmm0 899*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: addq $4, %rax 900*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: jne .LBB3_1 901*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: # BB#2: # %middle.block 902*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 903*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: paddq %xmm0, %xmm1 904*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: movd %xmm1, %eax 905*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT: retq 906*9880d681SAndroid Build Coastguard Worker; 907*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_2i8: 908*9880d681SAndroid Build Coastguard Worker; AVX2: # BB#0: # %entry 909*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 910*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 911*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 912*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .p2align 4, 0x90 913*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: .LBB3_1: # %vector.body 914*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 915*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 916*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero 917*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] 918*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] 919*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 920*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 921*9880d681SAndroid Build Coastguard 
Worker; AVX2-NEXT: addq $4, %rax 922*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: jne .LBB3_1 923*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: # BB#2: # %middle.block 924*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 925*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 926*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: vmovd %xmm0, %eax 927*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT: retq 928*9880d681SAndroid Build Coastguard Worker; 929*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_2i8: 930*9880d681SAndroid Build Coastguard Worker; AVX512F: # BB#0: # %entry 931*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 932*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 933*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 934*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .p2align 4, 0x90 935*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: .LBB3_1: # %vector.body 936*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 937*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 938*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero 939*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] 940*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] 941*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 942*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 943*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: addq $4, %rax 944*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: jne .LBB3_1 945*9880d681SAndroid Build 
Coastguard Worker; AVX512F-NEXT: # BB#2: # %middle.block 946*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 947*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 948*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: vmovd %xmm0, %eax 949*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT: retq 950*9880d681SAndroid Build Coastguard Worker; 951*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_2i8: 952*9880d681SAndroid Build Coastguard Worker; AVX512BW: # BB#0: # %entry 953*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 954*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 955*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 956*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .p2align 4, 0x90 957*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: .LBB3_1: # %vector.body 958*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 959*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 960*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero 961*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] 962*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] 963*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 964*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 965*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: addq $4, %rax 966*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: jne .LBB3_1 967*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: # BB#2: # %middle.block 968*9880d681SAndroid Build Coastguard Worker; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 969*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 970*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: vmovd %xmm0, %eax 971*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT: retq 972*9880d681SAndroid Build Coastguard Workerentry: 973*9880d681SAndroid Build Coastguard Worker br label %vector.body 974*9880d681SAndroid Build Coastguard Worker 975*9880d681SAndroid Build Coastguard Workervector.body: 976*9880d681SAndroid Build Coastguard Worker %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 977*9880d681SAndroid Build Coastguard Worker %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 978*9880d681SAndroid Build Coastguard Worker %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 979*9880d681SAndroid Build Coastguard Worker %1 = bitcast i8* %0 to <2 x i8>* 980*9880d681SAndroid Build Coastguard Worker %wide.load = load <2 x i8>, <2 x i8>* %1, align 4 981*9880d681SAndroid Build Coastguard Worker %2 = zext <2 x i8> %wide.load to <2 x i32> 982*9880d681SAndroid Build Coastguard Worker %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 983*9880d681SAndroid Build Coastguard Worker %4 = bitcast i8* %3 to <2 x i8>* 984*9880d681SAndroid Build Coastguard Worker %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4 985*9880d681SAndroid Build Coastguard Worker %5 = zext <2 x i8> %wide.load1 to <2 x i32> 986*9880d681SAndroid Build Coastguard Worker %6 = sub nsw <2 x i32> %2, %5 987*9880d681SAndroid Build Coastguard Worker %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1> 988*9880d681SAndroid Build Coastguard Worker %8 = sub nsw <2 x i32> zeroinitializer, %6 989*9880d681SAndroid Build Coastguard Worker %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8 990*9880d681SAndroid Build Coastguard Worker %10 = add nsw <2 x i32> %9, %vec.phi 991*9880d681SAndroid Build Coastguard Worker %index.next = add i64 
%index, 4 992*9880d681SAndroid Build Coastguard Worker %11 = icmp eq i64 %index.next, 1024 993*9880d681SAndroid Build Coastguard Worker br i1 %11, label %middle.block, label %vector.body 994*9880d681SAndroid Build Coastguard Worker 995*9880d681SAndroid Build Coastguard Workermiddle.block: 996*9880d681SAndroid Build Coastguard Worker %.lcssa = phi <2 x i32> [ %10, %vector.body ] 997*9880d681SAndroid Build Coastguard Worker %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef> 998*9880d681SAndroid Build Coastguard Worker %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf 999*9880d681SAndroid Build Coastguard Worker %12 = extractelement <2 x i32> %bin.rdx, i32 0 1000*9880d681SAndroid Build Coastguard Worker ret i32 %12 1001*9880d681SAndroid Build Coastguard Worker} 1002*9880d681SAndroid Build Coastguard Worker 1003