xref: /aosp_15_r20/external/llvm/test/CodeGen/X86/sad.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
2*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
3*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
5*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
6*9880d681SAndroid Build Coastguard Worker
7*9880d681SAndroid Build Coastguard Worker@a = global [1024 x i8] zeroinitializer, align 16
8*9880d681SAndroid Build Coastguard Worker@b = global [1024 x i8] zeroinitializer, align 16
9*9880d681SAndroid Build Coastguard Worker
10*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_16i8() nounwind {
; Sum-of-absolute-differences reduction over the byte arrays @a and @b.
; Each loop iteration loads <16 x i8> from both arrays, widens the lanes to
; i32, takes the per-lane absolute difference and adds it into a <16 x i32>
; accumulator; middle.block then folds that accumulator down into lane 0,
; which is the returned value.
; The autogenerated assertions below expect the loop body to be lowered to
; psadbw/vpsadbw for every feature set exercised by this test.
11*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_16i8:
12*9880d681SAndroid Build Coastguard Worker; SSE2:       # BB#0: # %entry
13*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
14*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
15*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm1, %xmm1
16*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    .p2align 4, 0x90
17*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  .LBB0_1: # %vector.body
18*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
19*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqu a+1024(%rax), %xmm2
20*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqu b+1024(%rax), %xmm3
21*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psadbw %xmm2, %xmm3
22*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm3, %xmm1
23*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    addq $4, %rax
24*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    jne .LBB0_1
25*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  # BB#2: # %middle.block
26*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm1
27*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm0
28*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm0
29*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
30*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm1
31*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
32*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm0
33*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd %xmm0, %eax
34*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    retq
35*9880d681SAndroid Build Coastguard Worker;
36*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_16i8:
37*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0: # %entry
38*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
39*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
40*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
41*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    .p2align 4, 0x90
42*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  .LBB0_1: # %vector.body
43*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
44*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovdqu a+1024(%rax), %xmm2
45*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
46*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm2
47*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
48*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    addq $4, %rax
49*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    jne .LBB0_1
50*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  # BB#2: # %middle.block
51*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
52*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
53*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
54*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
55*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
56*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
57*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd %xmm0, %eax
58*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vzeroupper
59*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
60*9880d681SAndroid Build Coastguard Worker;
61*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_16i8:
62*9880d681SAndroid Build Coastguard Worker; AVX512F:       # BB#0: # %entry
63*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
64*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
65*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    .p2align 4, 0x90
66*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  .LBB0_1: # %vector.body
67*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
68*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovdqu a+1024(%rax), %xmm1
69*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
70*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
71*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
72*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    addq $4, %rax
73*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    jne .LBB0_1
74*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  # BB#2: # %middle.block
75*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
76*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
77*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
78*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
79*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
80*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
81*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
82*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
83*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd %xmm0, %eax
84*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    retq
85*9880d681SAndroid Build Coastguard Worker;
86*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_16i8:
87*9880d681SAndroid Build Coastguard Worker; AVX512BW:       # BB#0: # %entry
88*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
89*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
90*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    .p2align 4, 0x90
91*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  .LBB0_1: # %vector.body
92*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
93*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovdqu a+1024(%rax), %xmm1
94*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
95*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
96*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
97*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    addq $4, %rax
98*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    jne .LBB0_1
99*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  # BB#2: # %middle.block
100*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
101*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
102*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
103*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
104*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
105*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
106*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
107*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
108*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd %xmm0, %eax
109*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    retq
110*9880d681SAndroid Build Coastguard Workerentry:
111*9880d681SAndroid Build Coastguard Worker  br label %vector.body
112*9880d681SAndroid Build Coastguard Worker
113*9880d681SAndroid Build Coastguard Workervector.body:
; Loop-carried state: %index is the byte offset into @a/@b, %vec.phi holds the
; running per-lane sums (zero on entry, %10 on the back edge).
114*9880d681SAndroid Build Coastguard Worker  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
115*9880d681SAndroid Build Coastguard Worker  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
; Load 16 bytes from each array at the current offset and zero-extend every
; byte to i32 so the subtraction below cannot wrap.
116*9880d681SAndroid Build Coastguard Worker  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
117*9880d681SAndroid Build Coastguard Worker  %1 = bitcast i8* %0 to <16 x i8>*
118*9880d681SAndroid Build Coastguard Worker  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
119*9880d681SAndroid Build Coastguard Worker  %2 = zext <16 x i8> %wide.load to <16 x i32>
120*9880d681SAndroid Build Coastguard Worker  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
121*9880d681SAndroid Build Coastguard Worker  %4 = bitcast i8* %3 to <16 x i8>*
122*9880d681SAndroid Build Coastguard Worker  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
123*9880d681SAndroid Build Coastguard Worker  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
; Per-lane absolute difference: keep %6 where it is greater than -1 (i.e.
; non-negative), otherwise use its negation (0 - %6).
124*9880d681SAndroid Build Coastguard Worker  %6 = sub nsw <16 x i32> %2, %5
125*9880d681SAndroid Build Coastguard Worker  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
126*9880d681SAndroid Build Coastguard Worker  %8 = sub nsw <16 x i32> zeroinitializer, %6
127*9880d681SAndroid Build Coastguard Worker  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
; Accumulate into the running sums.
128*9880d681SAndroid Build Coastguard Worker  %10 = add nsw <16 x i32> %9, %vec.phi
; NOTE(review): the index advances by 4 while each load spans 16 bytes, so
; consecutive iterations read overlapping windows, and the iterations at
; offsets 1012, 1016 and 1020 extend past the end of the 1024-byte arrays.
; Presumably harmless for a codegen-only test -- confirm before changing; the
; autogenerated assertions encode this step (addq $4) and would need to be
; regenerated.
129*9880d681SAndroid Build Coastguard Worker  %index.next = add i64 %index, 4
130*9880d681SAndroid Build Coastguard Worker  %11 = icmp eq i64 %index.next, 1024
131*9880d681SAndroid Build Coastguard Worker  br i1 %11, label %middle.block, label %vector.body
132*9880d681SAndroid Build Coastguard Worker
133*9880d681SAndroid Build Coastguard Workermiddle.block:
; Halving shuffle+add reduction of the <16 x i32> accumulator: add lanes 8-15
; onto 0-7, then 4-7 onto 0-3, then 2-3 onto 0-1, then lane 1 onto lane 0.
; The remaining shuffle lanes are undef because only lane 0 of the final
; vector is extracted and returned.
134*9880d681SAndroid Build Coastguard Worker  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
135*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
136*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
137*9880d681SAndroid Build Coastguard Worker  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
138*9880d681SAndroid Build Coastguard Worker  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
139*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
140*9880d681SAndroid Build Coastguard Worker  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
141*9880d681SAndroid Build Coastguard Worker  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
142*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
143*9880d681SAndroid Build Coastguard Worker  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
144*9880d681SAndroid Build Coastguard Worker  ret i32 %12
145*9880d681SAndroid Build Coastguard Worker}
146*9880d681SAndroid Build Coastguard Worker
147*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_32i8() nounwind {
148*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_32i8:
149*9880d681SAndroid Build Coastguard Worker; SSE2:       # BB#0: # %entry
150*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm12, %xmm12
151*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
152*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm4
153*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm2, %xmm2
154*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
155*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm1, %xmm1
156*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm13, %xmm13
157*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm15, %xmm15
158*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm5, %xmm5
159*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm14, %xmm14
160*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    .p2align 4, 0x90
161*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  .LBB1_1: # %vector.body
162*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
163*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
164*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
165*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
166*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
167*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1040(%rax), %xmm0
168*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1024(%rax), %xmm1
169*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
170*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
171*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
172*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
173*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
174*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, %xmm6
175*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
176*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
177*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, %xmm2
178*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
179*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
180*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1040(%rax), %xmm3
181*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1024(%rax), %xmm5
182*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[2,3,0,1]
183*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
184*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm3, %xmm10
185*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
186*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm3, %xmm0
187*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
188*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
189*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
190*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm10, %xmm2
191*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, %xmm3
192*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
193*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm5, %xmm1
194*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm5
195*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
196*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
197*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
198*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
199*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm3, %xmm6
200*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, %xmm10
201*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm9, %xmm4
202*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
203*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm9, %xmm7
204*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm8, %xmm3
205*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
206*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
207*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
208*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
209*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm5
210*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm11, %xmm4
211*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
212*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm11, %xmm8
213*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
214*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
215*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm3
216*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm3, %xmm4
217*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
218*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm3
219*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm3
220*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm8, %xmm4
221*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
222*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm8
223*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm8
224*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, %xmm4
225*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
226*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm5
227*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm5
228*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm4
229*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
230*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm7
231*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm7
232*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, %xmm4
233*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
234*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm6
235*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm6
236*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, %xmm4
237*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
238*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm1
239*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm1
240*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm4
241*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
242*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm2
243*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm2
244*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, %xmm4
245*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
246*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm0
247*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm0
248*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm10, %xmm4
249*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm15
250*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
251*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm2, %xmm13
252*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
253*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm2
254*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
255*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm6, %xmm4
256*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm7, %xmm14
257*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
258*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm5, %xmm6
259*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
260*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
261*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm8, %xmm1
262*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm3, %xmm0
263*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    addq $4, %rax
264*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    jne .LBB1_1
265*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  # BB#2: # %middle.block
266*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm15, %xmm2
267*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm14, %xmm1
268*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm13, %xmm4
269*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm5, %xmm0
270*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm0
271*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm2, %xmm1
272*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm1
273*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
274*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm0
275*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
276*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm1
277*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd %xmm1, %eax
278*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    retq
279*9880d681SAndroid Build Coastguard Worker;
280*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_32i8:
281*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0: # %entry
282*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
283*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
284*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
285*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    .p2align 4, 0x90
286*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  .LBB1_1: # %vector.body
287*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
288*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovdqa a+1024(%rax), %ymm2
289*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
290*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
291*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    addq $4, %rax
292*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    jne .LBB1_1
293*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  # BB#2: # %middle.block
294*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
295*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
296*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
297*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
298*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
299*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
300*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
301*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
302*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd %xmm0, %eax
303*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vzeroupper
304*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
305*9880d681SAndroid Build Coastguard Worker;
306*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_32i8:
307*9880d681SAndroid Build Coastguard Worker; AVX512F:       # BB#0: # %entry
308*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
309*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
310*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
311*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    .p2align 4, 0x90
312*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  .LBB1_1: # %vector.body
313*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
314*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovdqa a+1024(%rax), %ymm2
315*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
316*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %ymm1, %ymm2, %ymm2
317*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
318*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    addq $4, %rax
319*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    jne .LBB1_1
320*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  # BB#2: # %middle.block
321*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
322*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
323*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
324*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
325*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
326*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
327*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
328*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
329*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
330*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd %xmm0, %eax
331*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    retq
332*9880d681SAndroid Build Coastguard Worker;
333*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_32i8:
334*9880d681SAndroid Build Coastguard Worker; AVX512BW:       # BB#0: # %entry
335*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
336*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
337*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
338*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    .p2align 4, 0x90
339*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  .LBB1_1: # %vector.body
340*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
341*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovdqa a+1024(%rax), %ymm2
342*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
343*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %ymm1, %ymm2, %ymm2
344*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
345*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    addq $4, %rax
346*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    jne .LBB1_1
347*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  # BB#2: # %middle.block
348*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
349*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
350*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
351*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
352*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
353*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
354*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
355*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
356*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
357*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd %xmm0, %eax
358*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    retq
359*9880d681SAndroid Build Coastguard Workerentry:
360*9880d681SAndroid Build Coastguard Worker  br label %vector.body
361*9880d681SAndroid Build Coastguard Worker
362*9880d681SAndroid Build Coastguard Workervector.body:
363*9880d681SAndroid Build Coastguard Worker  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
364*9880d681SAndroid Build Coastguard Worker  %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
365*9880d681SAndroid Build Coastguard Worker  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
366*9880d681SAndroid Build Coastguard Worker  %1 = bitcast i8* %0 to <32 x i8>*
367*9880d681SAndroid Build Coastguard Worker  %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
368*9880d681SAndroid Build Coastguard Worker  %2 = zext <32 x i8> %wide.load to <32 x i32>
369*9880d681SAndroid Build Coastguard Worker  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
370*9880d681SAndroid Build Coastguard Worker  %4 = bitcast i8* %3 to <32 x i8>*
371*9880d681SAndroid Build Coastguard Worker  %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
372*9880d681SAndroid Build Coastguard Worker  %5 = zext <32 x i8> %wide.load1 to <32 x i32>
373*9880d681SAndroid Build Coastguard Worker  %6 = sub nsw <32 x i32> %2, %5
374*9880d681SAndroid Build Coastguard Worker  %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
375*9880d681SAndroid Build Coastguard Worker  %8 = sub nsw <32 x i32> zeroinitializer, %6
376*9880d681SAndroid Build Coastguard Worker  %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
377*9880d681SAndroid Build Coastguard Worker  %10 = add nsw <32 x i32> %9, %vec.phi
378*9880d681SAndroid Build Coastguard Worker  %index.next = add i64 %index, 4
379*9880d681SAndroid Build Coastguard Worker  %11 = icmp eq i64 %index.next, 1024
380*9880d681SAndroid Build Coastguard Worker  br i1 %11, label %middle.block, label %vector.body
381*9880d681SAndroid Build Coastguard Worker
382*9880d681SAndroid Build Coastguard Workermiddle.block:
383*9880d681SAndroid Build Coastguard Worker  %.lcssa = phi <32 x i32> [ %10, %vector.body ]
384*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
385*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
386*9880d681SAndroid Build Coastguard Worker  %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
387*9880d681SAndroid Build Coastguard Worker  %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
388*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
389*9880d681SAndroid Build Coastguard Worker  %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
390*9880d681SAndroid Build Coastguard Worker  %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
391*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
392*9880d681SAndroid Build Coastguard Worker  %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
393*9880d681SAndroid Build Coastguard Worker  %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
394*9880d681SAndroid Build Coastguard Worker  %12 = extractelement <32 x i32> %bin.rdx5, i32 0
395*9880d681SAndroid Build Coastguard Worker  ret i32 %12
396*9880d681SAndroid Build Coastguard Worker}
397*9880d681SAndroid Build Coastguard Worker
398*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_avx64i8() nounwind {
399*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_avx64i8:
400*9880d681SAndroid Build Coastguard Worker; SSE2:       # BB#0: # %entry
401*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    subq $232, %rsp
402*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm8, %xmm8
403*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
404*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm5, %xmm5
405*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm2, %xmm2
406*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm1, %xmm1
407*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm3, %xmm3
408*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm6, %xmm6
409*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
410*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
411*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm13, %xmm13
412*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm10, %xmm10
413*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
414*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
415*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm12, %xmm12
416*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
417*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
418*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm11, %xmm11
419*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm15, %xmm15
420*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm9, %xmm9
421*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm7, %xmm7
422*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
423*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    .p2align 4, 0x90
424*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  .LBB2_1: # %vector.body
425*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
426*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
427*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
428*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
429*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
430*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
431*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
432*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
433*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
434*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
435*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
436*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
437*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
438*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
439*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1040(%rax), %xmm13
440*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1024(%rax), %xmm1
441*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1056(%rax), %xmm3
442*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa a+1072(%rax), %xmm6
443*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
444*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
445*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
446*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm3, %xmm12
447*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
448*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
449*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm13[2,3,0,1]
450*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
451*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
452*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
453*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
454*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
455*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, %xmm0
456*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
457*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, %xmm15
458*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
459*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
460*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm13, %xmm0
461*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
462*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
463*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1040(%rax), %xmm7
464*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1024(%rax), %xmm11
465*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1056(%rax), %xmm9
466*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
467*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
468*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm4
469*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
470*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm7, %xmm13
471*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
472*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
473*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
474*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm0
475*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
476*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm11, %xmm4
477*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
478*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm11, %xmm1
479*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
480*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
481*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
482*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm15
483*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
484*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm9, %xmm4
485*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
486*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
487*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm9, %xmm3
488*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, %xmm0
489*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
490*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm5, %xmm10
491*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm15
492*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
493*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
494*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
495*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
496*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
497*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm0, %xmm5
498*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
499*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm0
500*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
501*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm7, %xmm2
502*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
503*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
504*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
505*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
506*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
507*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
508*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
509*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm0, %xmm15
510*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm11
511*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
512*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
513*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
514*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
515*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm12
516*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm14, %xmm0
517*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
518*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm14, %xmm2
519*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm14
520*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, %xmm9
521*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
522*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
523*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
524*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm0, %xmm11
525*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa b+1072(%rax), %xmm0
526*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
527*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
528*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, %xmm5
529*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
530*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm0, %xmm6
531*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
532*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
533*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm5, %xmm9
534*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
535*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm0
536*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
537*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
538*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, %xmm5
539*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
540*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm4, %xmm7
541*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
542*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
543*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psubd %xmm5, %xmm0
544*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm0, %xmm4
545*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
546*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm0
547*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm0
548*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm7, %xmm4
549*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
550*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm7
551*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm7
552*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm9, %xmm4
553*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
554*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm9
555*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm9
556*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, %xmm4
557*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
558*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm6
559*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm6
560*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
561*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm11, %xmm4
562*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
563*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm11
564*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm11
565*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm14, %xmm4
566*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
567*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm14
568*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm14
569*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm12, %xmm4
570*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
571*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm12
572*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm12
573*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
574*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm15, %xmm4
575*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
576*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm15
577*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm15
578*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
579*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm4
580*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
581*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm2
582*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm2
583*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
584*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
585*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm4
586*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
587*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm2
588*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm2
589*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
590*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm10, %xmm4
591*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
592*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm10
593*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm10
594*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm3, %xmm4
595*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
596*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm3
597*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm3
598*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
599*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm4
600*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
601*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm2
602*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm2
603*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
604*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm1, %xmm4
605*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
606*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm1
607*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm1
608*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
609*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm4
610*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
611*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm2
612*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm2
613*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, %xmm5
614*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm13, %xmm4
615*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psrad $31, %xmm4
616*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm4, %xmm13
617*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm4, %xmm13
618*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
619*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm13, %xmm2
620*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
621*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
622*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
623*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm5, %xmm6
624*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
625*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm4
626*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
627*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
628*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
629*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
630*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
631*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm3, %xmm4
632*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
633*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
634*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
635*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm10, %xmm4
636*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
637*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
638*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
639*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
640*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
641*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
642*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm15, %xmm1
643*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
644*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
645*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
646*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm14, %xmm4
647*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
648*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
649*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm11, %xmm4
650*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
651*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
652*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
653*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
654*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm9, %xmm4
655*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
656*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
657*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
658*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm7, %xmm4
659*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
660*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
661*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa (%rsp), %xmm4 # 16-byte Reload
662*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm4
663*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
664*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
665*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    addq $4, %rax
666*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    jne .LBB2_1
667*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  # BB#2: # %middle.block
668*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
669*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm7, %xmm13
670*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
671*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm15, %xmm6
672*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm11, %xmm3
673*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm10
674*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm12, %xmm2
675*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
676*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm9, %xmm0
677*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm2, %xmm0
678*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm3, %xmm10
679*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm5, %xmm6
680*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm1, %xmm13
681*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm6, %xmm13
682*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm10
683*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm13, %xmm10
684*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
685*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm10, %xmm0
686*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
687*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddd %xmm0, %xmm1
688*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd %xmm1, %eax
689*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    addq $232, %rsp
690*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    retq
691*9880d681SAndroid Build Coastguard Worker;
692*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_avx64i8:
693*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0: # %entry
694*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm0, %ymm0, %ymm0
695*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
696*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
697*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
698*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
699*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
700*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm6, %ymm6, %ymm6
701*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm5, %ymm5, %ymm5
702*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %ymm7, %ymm7, %ymm7
703*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    .p2align 4, 0x90
704*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  .LBB2_1: # %vector.body
705*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
706*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
707*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
708*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
709*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
710*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
711*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
712*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
713*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
714*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
715*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
716*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm8, %ymm15, %ymm8
717*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
718*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm14, %ymm14
719*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
720*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm13, %ymm13
721*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
722*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm12, %ymm12
723*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
724*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm11, %ymm11
725*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
726*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm10, %ymm10
727*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
728*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm9
729*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
730*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
731*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
732*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm15
733*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm8, %ymm8
734*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm3, %ymm8, %ymm3
735*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm14, %ymm8
736*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm8, %ymm1
737*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm13, %ymm8
738*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm2, %ymm8, %ymm2
739*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm12, %ymm8
740*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm0, %ymm8, %ymm0
741*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm11, %ymm8
742*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm4, %ymm8, %ymm4
743*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm10, %ymm8
744*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm6, %ymm8, %ymm6
745*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
746*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
747*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpabsd %ymm15, %ymm8
748*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm7, %ymm8, %ymm7
749*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    addq $4, %rax
750*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    jne .LBB2_1
751*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  # BB#2: # %middle.block
752*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm6, %ymm2, %ymm2
753*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm7, %ymm3, %ymm3
754*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm4, %ymm0, %ymm0
755*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm5, %ymm1, %ymm1
756*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
757*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
758*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
759*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
760*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
761*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
762*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
763*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
764*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd %xmm0, %eax
765*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vzeroupper
766*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
767*9880d681SAndroid Build Coastguard Worker;
768*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_avx64i8:
769*9880d681SAndroid Build Coastguard Worker; AVX512F:       # BB#0: # %entry
770*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm0, %zmm0, %zmm0
771*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
772*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
773*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
774*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxord %zmm3, %zmm3, %zmm3
775*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    .p2align 4, 0x90
776*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  .LBB2_1: # %vector.body
777*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
778*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
779*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
780*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
781*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
782*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
783*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
784*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
785*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
786*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsubd %zmm11, %zmm7, %zmm7
787*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsubd %zmm10, %zmm6, %zmm6
788*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsubd %zmm9, %zmm5, %zmm5
789*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsubd %zmm8, %zmm4, %zmm4
790*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpabsd %zmm4, %zmm4
791*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpabsd %zmm5, %zmm5
792*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpabsd %zmm6, %zmm6
793*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpabsd %zmm7, %zmm7
794*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm3, %zmm7, %zmm3
795*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm2, %zmm6, %zmm2
796*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm5, %zmm1
797*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
798*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    addq $4, %rax
799*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    jne .LBB2_1
800*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  # BB#2: # %middle.block
801*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
802*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm3, %zmm1, %zmm1
803*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
804*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
805*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
806*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
807*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
808*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
809*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
810*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
811*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
812*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd %xmm0, %eax
813*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    retq
814*9880d681SAndroid Build Coastguard Worker;
815*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_avx64i8:
816*9880d681SAndroid Build Coastguard Worker; AVX512BW:       # BB#0: # %entry
817*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxord %zmm0, %zmm0, %zmm0
818*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
819*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
820*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    .p2align 4, 0x90
821*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  .LBB2_1: # %vector.body
822*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
823*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovdqu8 a+1024(%rax), %zmm2
824*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %zmm2, %zmm2
825*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
826*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    addq $4, %rax
827*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    jne .LBB2_1
828*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  # BB#2: # %middle.block
829*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm1
830*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
831*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
832*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
833*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
834*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
835*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
836*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
837*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
838*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
839*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
840*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd %xmm0, %eax
841*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    retq
842*9880d681SAndroid Build Coastguard Workerentry:
843*9880d681SAndroid Build Coastguard Worker  br label %vector.body
844*9880d681SAndroid Build Coastguard Worker
845*9880d681SAndroid Build Coastguard Workervector.body:
846*9880d681SAndroid Build Coastguard Worker  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
847*9880d681SAndroid Build Coastguard Worker  %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
848*9880d681SAndroid Build Coastguard Worker  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
849*9880d681SAndroid Build Coastguard Worker  %1 = bitcast i8* %0 to <64 x i8>*
850*9880d681SAndroid Build Coastguard Worker  %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
851*9880d681SAndroid Build Coastguard Worker  %2 = zext <64 x i8> %wide.load to <64 x i32>
852*9880d681SAndroid Build Coastguard Worker  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
853*9880d681SAndroid Build Coastguard Worker  %4 = bitcast i8* %3 to <64 x i8>*
854*9880d681SAndroid Build Coastguard Worker  %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
855*9880d681SAndroid Build Coastguard Worker  %5 = zext <64 x i8> %wide.load1 to <64 x i32>
856*9880d681SAndroid Build Coastguard Worker  %6 = sub nsw <64 x i32> %2, %5
857*9880d681SAndroid Build Coastguard Worker  %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
858*9880d681SAndroid Build Coastguard Worker  %8 = sub nsw <64 x i32> zeroinitializer, %6
859*9880d681SAndroid Build Coastguard Worker  %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
860*9880d681SAndroid Build Coastguard Worker  %10 = add nsw <64 x i32> %9, %vec.phi
861*9880d681SAndroid Build Coastguard Worker  %index.next = add i64 %index, 4
862*9880d681SAndroid Build Coastguard Worker  %11 = icmp eq i64 %index.next, 1024
863*9880d681SAndroid Build Coastguard Worker  br i1 %11, label %middle.block, label %vector.body
864*9880d681SAndroid Build Coastguard Worker
865*9880d681SAndroid Build Coastguard Workermiddle.block:
866*9880d681SAndroid Build Coastguard Worker  %.lcssa = phi <64 x i32> [ %10, %vector.body ]
867*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
868*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
869*9880d681SAndroid Build Coastguard Worker  %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
870*9880d681SAndroid Build Coastguard Worker  %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
871*9880d681SAndroid Build Coastguard Worker  %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
872*9880d681SAndroid Build Coastguard Worker  %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
873*9880d681SAndroid Build Coastguard Worker  %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
874*9880d681SAndroid Build Coastguard Worker  %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
875*9880d681SAndroid Build Coastguard Worker  %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
876*9880d681SAndroid Build Coastguard Worker  %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
877*9880d681SAndroid Build Coastguard Worker  %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
878*9880d681SAndroid Build Coastguard Worker  %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
879*9880d681SAndroid Build Coastguard Worker  %12 = extractelement <64 x i32> %bin.rdx6, i32 0
880*9880d681SAndroid Build Coastguard Worker  ret i32 %12
881*9880d681SAndroid Build Coastguard Worker}
882*9880d681SAndroid Build Coastguard Worker
; Test case sad_2i8 - sum-of-absolute-differences reduction over <2 x i8>
; slices of the globals @a and @b. Each loop iteration zero-extends the two
; loaded vectors to <2 x i32>, computes |a - b| with a compare + select, and
; accumulates the result into a <2 x i32> phi. The middle block folds lane 1
; into lane 0 and returns lane 0 as the i32 result.
; The autogenerated assertions below expect a psadbw / vpsadbw instruction in
; the loop body for each of the four check prefixes.
883*9880d681SAndroid Build Coastguard Workerdefine i32 @sad_2i8() nounwind {
884*9880d681SAndroid Build Coastguard Worker; SSE2-LABEL: sad_2i8:
885*9880d681SAndroid Build Coastguard Worker; SSE2:       # BB#0: # %entry
886*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pxor %xmm0, %xmm0
887*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
888*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movl $65535, %ecx # imm = 0xFFFF
889*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd %ecx, %xmm1
890*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    .p2align 4, 0x90
891*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  .LBB3_1: # %vector.body
892*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
893*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
894*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
895*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pand %xmm1, %xmm3
896*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pand %xmm1, %xmm2
897*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    psadbw %xmm3, %xmm2
898*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddq %xmm2, %xmm0
899*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    addq $4, %rax
900*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    jne .LBB3_1
901*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:  # BB#2: # %middle.block
902*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
903*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    paddq %xmm0, %xmm1
904*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    movd %xmm1, %eax
905*9880d681SAndroid Build Coastguard Worker; SSE2-NEXT:    retq
906*9880d681SAndroid Build Coastguard Worker;
907*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: sad_2i8:
908*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0: # %entry
909*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
910*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
911*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
912*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    .p2align 4, 0x90
913*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  .LBB3_1: # %vector.body
914*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
915*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
916*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
917*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
918*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
919*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
920*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
921*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    addq $4, %rax
922*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    jne .LBB3_1
923*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:  # BB#2: # %middle.block
924*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
925*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
926*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovd %xmm0, %eax
927*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
928*9880d681SAndroid Build Coastguard Worker;
929*9880d681SAndroid Build Coastguard Worker; AVX512F-LABEL: sad_2i8:
930*9880d681SAndroid Build Coastguard Worker; AVX512F:       # BB#0: # %entry
931*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
932*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
933*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
934*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    .p2align 4, 0x90
935*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  .LBB3_1: # %vector.body
936*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
937*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
938*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
939*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
940*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
941*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
942*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
943*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    addq $4, %rax
944*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    jne .LBB3_1
945*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:  # BB#2: # %middle.block
946*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
947*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
948*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    vmovd %xmm0, %eax
949*9880d681SAndroid Build Coastguard Worker; AVX512F-NEXT:    retq
950*9880d681SAndroid Build Coastguard Worker;
951*9880d681SAndroid Build Coastguard Worker; AVX512BW-LABEL: sad_2i8:
952*9880d681SAndroid Build Coastguard Worker; AVX512BW:       # BB#0: # %entry
953*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
954*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
955*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
956*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    .p2align 4, 0x90
957*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  .LBB3_1: # %vector.body
958*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
959*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
960*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
961*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
962*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
963*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
964*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
965*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    addq $4, %rax
966*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    jne .LBB3_1
967*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:  # BB#2: # %middle.block
968*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
969*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
970*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    vmovd %xmm0, %eax
971*9880d681SAndroid Build Coastguard Worker; AVX512BW-NEXT:    retq
972*9880d681SAndroid Build Coastguard Workerentry:
973*9880d681SAndroid Build Coastguard Worker  br label %vector.body
974*9880d681SAndroid Build Coastguard Worker
975*9880d681SAndroid Build Coastguard Workervector.body:
; Loop-carried state - the byte index into @a/@b (starts at 0) and the running
; <2 x i32> accumulator (starts at zero).
976*9880d681SAndroid Build Coastguard Worker  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
977*9880d681SAndroid Build Coastguard Worker  %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
; Load two bytes from @a at %index and widen them to <2 x i32>.
978*9880d681SAndroid Build Coastguard Worker  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
979*9880d681SAndroid Build Coastguard Worker  %1 = bitcast i8* %0 to <2 x i8>*
980*9880d681SAndroid Build Coastguard Worker  %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
981*9880d681SAndroid Build Coastguard Worker  %2 = zext <2 x i8> %wide.load to <2 x i32>
; Same for @b at the same index.
982*9880d681SAndroid Build Coastguard Worker  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
983*9880d681SAndroid Build Coastguard Worker  %4 = bitcast i8* %3 to <2 x i8>*
984*9880d681SAndroid Build Coastguard Worker  %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
985*9880d681SAndroid Build Coastguard Worker  %5 = zext <2 x i8> %wide.load1 to <2 x i32>
; Absolute difference per lane - keep %6 where it is greater than -1 (i.e.
; non-negative), otherwise use its negation %8.
986*9880d681SAndroid Build Coastguard Worker  %6 = sub nsw <2 x i32> %2, %5
987*9880d681SAndroid Build Coastguard Worker  %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
988*9880d681SAndroid Build Coastguard Worker  %8 = sub nsw <2 x i32> zeroinitializer, %6
989*9880d681SAndroid Build Coastguard Worker  %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
; Accumulate into the running per-lane sum.
990*9880d681SAndroid Build Coastguard Worker  %10 = add nsw <2 x i32> %9, %vec.phi
; NOTE(review): the index advances by 4 while each load covers only 2 bytes,
; so bytes 2..3 of every 4-byte group are never read. The expected asm above
; (addq $4) depends on this step; presumably intentional for the test - confirm
; before changing it.
991*9880d681SAndroid Build Coastguard Worker  %index.next = add i64 %index, 4
; Exit once the index reaches 1024, the element count of @a and @b.
992*9880d681SAndroid Build Coastguard Worker  %11 = icmp eq i64 %index.next, 1024
993*9880d681SAndroid Build Coastguard Worker  br i1 %11, label %middle.block, label %vector.body
994*9880d681SAndroid Build Coastguard Worker
995*9880d681SAndroid Build Coastguard Workermiddle.block:
; Horizontal reduction - shuffle lane 1 into lane 0, add it to the accumulator,
; then extract lane 0 as the scalar return value.
996*9880d681SAndroid Build Coastguard Worker  %.lcssa = phi <2 x i32> [ %10, %vector.body ]
997*9880d681SAndroid Build Coastguard Worker  %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
998*9880d681SAndroid Build Coastguard Worker  %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
999*9880d681SAndroid Build Coastguard Worker  %12 = extractelement <2 x i32> %bin.rdx, i32 0
1000*9880d681SAndroid Build Coastguard Worker  ret i32 %12
1001*9880d681SAndroid Build Coastguard Worker}
1002*9880d681SAndroid Build Coastguard Worker
1003