;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Computes (first == 1) or accumulates (first == 0) the SAD of two
; 4-pixel src rows against each of the four refs, one dword accumulator
; register (m4-m7) per ref; m1 must hold words of 1.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  movu                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  movu                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  movu                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  movu                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; advance to the next pair of rows (strides are in pixels, 2 bytes each)
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
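
; Note on the HIGH_PROCESS macros: psubusw saturates at zero, so taking
; both (src - ref) and (ref - src) and OR-ing the results yields the
; per-word absolute difference. pmaddwd against m1 (words of 1) then
; sums adjacent differences into the per-reference dword accumulators
; m4-m7.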
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 or 4x4
; Macro Arguments:
;   1: Width
;   2: Height
;   3: If 0, then normal sad; if 2, then skip every other row
%macro HIGH_SADNXN4D 2-3 0
%if %3 == 0  ; normal sad
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                            res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                            ref2, ref3, ref4
%endif
%else  ; %3 == 2, downsample
%if UNIX64
cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                                  res, ref2, ref3, ref4
%else
cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                                  ref2, ref3, ref4
%endif
%endif  ; sad/skip

; set m1 to a vector of word 1s, the pmaddwd multiplier that sums
; adjacent word differences into dword accumulators
  push                srcq
  mov                 srcd, 0x00010001
  movd                  m1, srcd
  pshufd                m1, m1, 0x0
  pop                 srcq

%if %3 == 2  ; skip rows
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif  ; skip rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided

; load the four reference pointers from ref[4]
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 2  ; Downsampling by two
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits),
  ; so in high bit depth even the smallest width (4) needs 128 bits,
  ; i.e. an XMM register
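  ; Reduce the four dword lanes of each accumulator to one total and
  ; pack the four totals into a single register, ordered ref1..ref4.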
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  punpcklqdq            m4, m6
%if %3 == 2  ; skip rows
  ; double the sums: only every other row was measured
  pslld                 m4, 1
%endif
  movifnidn             r4, r4mp  ; res pointer
  movu                [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16, 8
HIGH_SADNXN4D 8, 16
HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4

HIGH_SADNXN4D 64, 64, 2
HIGH_SADNXN4D 64, 32, 2
HIGH_SADNXN4D 32, 64, 2
HIGH_SADNXN4D 32, 32, 2
HIGH_SADNXN4D 32, 16, 2
HIGH_SADNXN4D 16, 32, 2
HIGH_SADNXN4D 16, 16, 2
HIGH_SADNXN4D 16, 8, 2
HIGH_SADNXN4D 8, 16, 2
HIGH_SADNXN4D 8, 8, 2
HIGH_SADNXN4D 4, 8, 2
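
; Note: there are no skip variants for 8x4 or 4x4. The skip kernel needs
; at least 8 source rows (the first and final HIGH_PROCESS calls each
; consume four of them at the doubled stride), so num_rep = (%2-8)/4
; would go negative for 4-row blocks.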