;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  movd                  m1, [srcq +%4]
  movd                  m2, [ref1q+%5]
  punpckldq             m0, m1
  punpckldq             m6, m2
  movd                  m1, [ref2q+%5]
  movd                  m2, [ref3q+%5]
  movd                  m3, [ref4q+%5]
  punpckldq             m4, m1
  punpckldq             m7, m2
  punpckldq             m5, m3
  movlhps               m0, m0
  movlhps               m6, m4
  movlhps               m7, m5
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movd                  m1, [ref1q+%3]
  movd                  m5, [ref1q+%5]
  movd                  m2, [ref2q+%3]
  movd                  m4, [ref2q+%5]
  punpckldq             m1, m5
  punpckldq             m2, m4
  movd                  m3, [ref3q+%3]
  movd                  m5, [ref3q+%5]
  punpckldq             m3, m5
  movd                  m4, [ref4q+%3]
  movd                  m5, [ref4q+%5]
  punpckldq             m4, m5
  movd                  m5, [srcq +%4]
  punpckldq             m0, m5
  movlhps               m0, m0
  movlhps               m1, m2
  movlhps               m3, m4
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro
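; Register roles in the PROCESS_* macros above: m0 carries the source
; pixels and the per-reference SADs are accumulated with psadbw/paddd.
; For widths of 8 and up, m4-m7 track ref1-ref4 respectively; the 4-wide
; case instead packs the ref1/ref2 sums into the two qword halves of m6
; and the ref3/ref4 sums into m7 (the two source rows are duplicated
; across m0 by movlhps so one psadbw covers two references at once).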

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
%macro SADNXN4D 2-3 0
%if %3 == 1  ; skip rows
%if UNIX64
cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                           res, ref2, ref3, ref4
%else
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                           ref2, ref3, ref4
%endif
%else  ; normal sad
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                     res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                     ref2, ref3, ref4
%endif
%endif
%if %3 == 1
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 1  ; downsample number of rows by 2
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if %1 > 4
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
%if %3 == 1
  pslld                 m4, 1
%endif
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  pshufd                m6, m6, 0x08
  pshufd                m7, m7, 0x08
%if %3 == 1
  pslld                 m6, 1
  pslld                 m7, 1
%endif
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4
SADNXN4D  4,  8
SADNXN4D  4,  4

SADNXN4D 64, 64, 1
SADNXN4D 64, 32, 1
SADNXN4D 32, 64, 1
SADNXN4D 32, 32, 1
SADNXN4D 32, 16, 1
SADNXN4D 16, 32, 1
SADNXN4D 16, 16, 1
SADNXN4D 16,  8, 1
SADNXN4D  8, 16, 1
SADNXN4D  8,  8, 1
SADNXN4D  4,  8, 1
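
; Usage sketch: a minimal C caller for one of the entry points emitted
; above, matching the prototype documented before SADNXN4D. The buffer
; and stride names (src, ref0..ref3, src_stride, ref_stride) are
; illustrative placeholders, not part of this file.
;
;   uint32_t res[4];
;   uint8_t *ref[4] = { ref0, ref1, ref2, ref3 };
;   vpx_sad16x16x4d_sse2(src, src_stride, ref, ref_stride, res);
;   /* res[i] now holds the SAD of the 16x16 block at src against
;      ref[i]. The sad_skip_ variants compare every other row and
;      double the total (the %3 == 1 paths above). */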