;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; ----------------------------------------------------------------------------
; Register roles shared by every HIGH_PROCESS_* macro below:
;   m0      current source pixels (16-bit words)
;   m1      every word == 1 (set up in HIGH_SADNXN4D); pmaddwd against it adds
;           adjacent word pairs into dwords
;   m2, m3  scratch
;   m4-m7   dword SAD accumulators for ref1..ref4 respectively
;
; The absolute difference of unsigned words is formed as
;   |a - b| = (a -sat b) | (b -sat a)        (psubusw / psubusw / por)
; because one of the two saturating subtractions is always zero.
; NOTE(review): pmaddwd treats the words as signed, so this presumably relies
; on samples staying below 2^15 - confirm against the supported bit depths.
;
; All offsets are given in pixels and scaled by 2 (bytes per pixel) inside the
; macros. When advance_at_end == 1 the five pointers move forward by
; stride*4 bytes, i.e. two rows of 16-bit pixels.
; ----------------------------------------------------------------------------

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   %1 first           1: initialise m4-m7 from this pair of rows,
;                      otherwise: add this pair of rows to m4-m7
;   %2 / %3            src / ref offset of the first 4-pixel row
;   %4 / %5            src / ref offset of the second 4-pixel row
;   %6 advance_at_end  (default 0) 1: step all pointers two rows forward
; The two 4-pixel rows are packed into the low and high halves of one register.
; Rows are fetched with 8-byte loads (movh / movhps) so that nothing beyond the
; 4 pixels of each row is read.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]
%if %1 == 1
  movh                  m4, [ref1q+%3*2]
  movh                  m5, [ref2q+%3*2]
  movh                  m6, [ref3q+%3*2]
  movh                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  ; ref1 / ref2
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  ; ref3 / ref4
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; ref1
  movh                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  ; ref2
  movh                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  ; ref3
  movh                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  ; ref4
  movh                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   Same argument meaning as HIGH_PROCESS_4x2x4, but each offset addresses a
;   group of 8 pixels (one full XMM register). The two groups are two rows when
;   invoked directly from HIGH_SADNXN4D, or two adjacent 8-pixel spans of the
;   same row when invoked through the wider HIGH_PROCESS_{16,32,64}x2x4.
;   Source pixels are read with mova, so the source address must be 16-byte
;   aligned; reference pixels are read with movu.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px (always accumulated; m4-m7 are valid by now)
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  ; The pointer updates sit between the last loads and the final accumulate;
  ; every memory operand above has already been consumed at this point.
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   Two rows of 16 pixels, each row handled as two 8-pixel halves. Only the
;   first half-row may initialise the accumulators and only the last one may
;   advance the pointers.
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 +  8), (%3 +  8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 +  8), (%5 +  8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   Two rows of 32 pixels, each row handled as two 16-pixel halves.
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   Two rows of 64 pixels, each row handled as two 32-pixel halves.
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
; Macro Arguments:
;   1: Width
;   2: Height
;   3: If 0, then normal sad, if 2, then skip every other row
;
; Emits highbd_sad<W>x<H>x4d (arg 3 == 0) or highbd_sad_skip_<W>x<H>x4d
; (arg 3 == 2). The four SADs are written to res[0..3] with one unaligned
; 16-byte store.
;
; Row accounting: every HIGH_PROCESS_* call consumes two rows, and there is
; one leading call, num_rep repeated calls and one trailing call.
;   normal: 2 + (H-4)/2 calls = H/2 calls  -> H rows
;   skip  : 2 + (H-8)/4 calls = H/4 calls  -> H/2 rows at doubled stride; the
;           result is then doubled (pslld by 1) to approximate the full SAD.
; The skip form therefore needs H >= 8 (H == 4 would give num_rep == -1).
%macro HIGH_SADNXN4D 2-3 0
%if %3 == 0  ; normal sad
%if AOM_ARCH_X86_64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
; Only 7 GPRs here: res is not given a named register and is fetched from the
; argument area (r4mp) right before the final store.
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif  ; AOM_ARCH_X86_64
%else  ; %3 == 2, downsample
%if AOM_ARCH_X86_64
cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif  ; AOM_ARCH_X86_64
%endif  ; normal/skip

; set m1 to 0x0001 in every word; src is borrowed as the scratch GPR and
; restored afterwards
  push                srcq
  mov                 srcd, 0x00010001
  movd                  m1, srcd
  pshufd                m1, m1, 0x0
  pop                 srcq

%if %3 == 2  ; skip rows: visit every other row by doubling both strides
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif  ; skip rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  ; unpack ref[0..3]; ref1q holds the array base, so it is overwritten last
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  ; first pair of rows initialises the accumulators
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 2  ;  Downsampling by two
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  ; last pair of rows: no pointer advance needed
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  ; Horizontal reduction: fold each accumulator's high qword onto its low
  ; qword, interleave, fold once more -> m4 = { sad1, sad2, sad3, sad4 }.
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  punpcklqdq            m4, m6
%if %3 == 2  ; skip rows: only half the rows were summed, so double the result
  pslld                 m4, 1
%endif
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
HIGH_SADNXN4D  4, 16
HIGH_SADNXN4D 16,  4
HIGH_SADNXN4D  8, 32
HIGH_SADNXN4D 32,  8
HIGH_SADNXN4D 16, 64
HIGH_SADNXN4D 64, 16

HIGH_SADNXN4D 64, 64, 2
HIGH_SADNXN4D 64, 32, 2
HIGH_SADNXN4D 32, 64, 2
HIGH_SADNXN4D 32, 32, 2
HIGH_SADNXN4D 32, 16, 2
HIGH_SADNXN4D 16, 32, 2
HIGH_SADNXN4D 16, 16, 2
HIGH_SADNXN4D 16,  8, 2
HIGH_SADNXN4D  8, 16, 2
HIGH_SADNXN4D  8,  8, 2
HIGH_SADNXN4D  4,  8, 2
HIGH_SADNXN4D  4, 16, 2
HIGH_SADNXN4D  8, 32, 2
HIGH_SADNXN4D 32,  8, 2
HIGH_SADNXN4D 16, 64, 2
HIGH_SADNXN4D 64, 16, 2

; Current code cannot handle the case when the height is downsampled to 2
; (num_rep would evaluate to (4-8)/4 = -1 for a height of 4).
; HIGH_SADNXN4D 16,  4, 2
; HIGH_SADNXN4D  8,  4, 2
; HIGH_SADNXN4D  4,  4, 2