;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; 'spill_src_stride' has a large effect on how this code works.
;
; When 'spill_src_stride' is false, 'src_strideq' lives in a register, so
; the form [srcq + src_strideq + offset] is allowed and we can use it to
; access src memory without updating 'srcq' on every line. We only advance
; 'srcq' every two lines, with a compact LEA instruction such as
; [srcq+src_strideq*2].
;
; When 'spill_src_stride' is true, 'src_strideq' lives in memory, so the
; form above cannot be used and we have to update 'srcq' at the end of
; each line instead. Because each macro processes two parts (first, second)
; together, the second part may sit on the next line, which means we may
; also need to add one 'src_strideq' to 'srcq' before processing the
; second part.

%macro HANDLE_SECOND_OFFSET 0
  %if spill_src_stride
    %define second_offset 0
    add srcq, src_strideq
  %else
    %define second_offset (src_strideq)
  %endif
%endmacro

; This is specifically designed for the case where src_strideq is a memory
; location; in that case we cannot do the complex address calculation with
; LEA and fall back to a simple ADD instruction at the end of each line.
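;
; As an illustrative sketch (not code emitted verbatim; the macros below
; use width-dependent load instructions), one pair of lines expands roughly
; as follows:
;
;   spill_src_stride == 0:
;     movd  m1, [srcq + src_strideq]      ; second line, stride in a register
;     lea   srcq, [srcq+src_strideq*2]    ; advance to the next pair
;
;   spill_src_stride == 1:
;     add   srcq, src_strideq             ; HANDLE_SECOND_OFFSET
;     movd  m1, [srcq]                    ; second line, second_offset == 0
;     add   srcq, src_strideq             ; ADVANCE_END_OF_TWO_LINES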
%macro ADVANCE_END_OF_TWO_LINES 0
  %if spill_src_stride
    add srcq, src_strideq
  %else
    lea srcq, [srcq+src_strideq*2]
  %endif

  ; note: ref_stride is never spilled when processing two lines
  lea ref1q, [ref1q+ref_strideq*2]
  lea ref2q, [ref2q+ref_strideq*2]
  lea ref3q, [ref3q+ref_strideq*2]
  lea ref4q, [ref4q+ref_strideq*2]
%endmacro

; PROCESS_4x2x4 first
%macro PROCESS_4x2x4 1
  movd m0, [srcq]
  HANDLE_SECOND_OFFSET
%if %1 == 1
  movd m6, [ref1q]
  movd m4, [ref2q]
  movd m7, [ref3q]
  movd m5, [ref4q]

  movd m1, [srcq + second_offset]
  movd m2, [ref1q+ref_strideq]
  punpckldq m0, m1
  punpckldq m6, m2
  movd m1, [ref2q+ref_strideq]
  movd m2, [ref3q+ref_strideq]
  movd m3, [ref4q+ref_strideq]
  punpckldq m4, m1
  punpckldq m7, m2
  punpckldq m5, m3
  movlhps m0, m0
  movlhps m6, m4
  movlhps m7, m5
  psadbw m6, m0
  psadbw m7, m0
%else
  movd m1, [ref1q]
  movd m5, [ref1q+ref_strideq]
  movd m2, [ref2q]
  movd m4, [ref2q+ref_strideq]
  punpckldq m1, m5
  punpckldq m2, m4
  movd m3, [ref3q]
  movd m5, [ref3q+ref_strideq]
  punpckldq m3, m5
  movd m4, [ref4q]
  movd m5, [ref4q+ref_strideq]
  punpckldq m4, m5
  movd m5, [srcq + second_offset]
  punpckldq m0, m5
  movlhps m0, m0
  movlhps m1, m2
  movlhps m3, m4
  psadbw m1, m0
  psadbw m3, m0
  paddd m6, m1
  paddd m7, m3
%endif
%endmacro

; PROCESS_8x2x4 first
%macro PROCESS_8x2x4 1
  movh m0, [srcq]
  HANDLE_SECOND_OFFSET
%if %1 == 1
  movh m4, [ref1q]
  movh m5, [ref2q]
  movh m6, [ref3q]
  movh m7, [ref4q]
  movhps m0, [srcq + second_offset]
  movhps m4, [ref1q+ref_strideq]
  movhps m5, [ref2q+ref_strideq]
  movhps m6, [ref3q+ref_strideq]
  movhps m7, [ref4q+ref_strideq]
  psadbw m4, m0
  psadbw m5, m0
  psadbw m6, m0
  psadbw m7, m0
%else
  movh m1, [ref1q]
  movh m2, [ref2q]
  movhps m0, [srcq + second_offset]
  movhps m1, [ref1q+ref_strideq]
  movhps m2, [ref2q+ref_strideq]
  psadbw m1, m0
  psadbw m2, m0
  paddd m4, m1
  paddd m5, m2

  movh m1, [ref3q]
  movhps m1, [ref3q+ref_strideq]
  movh m2, [ref4q]
  movhps m2, [ref4q+ref_strideq]
  psadbw m1, m0
  psadbw m2, m0
  paddd m6, m1
  paddd m7, m2
%endif
%endmacro

; PROCESS_FIRST_MMSIZE
%macro PROCESS_FIRST_MMSIZE 0
  mova m0, [srcq]
  movu m4, [ref1q]
  movu m5, [ref2q]
  movu m6, [ref3q]
  movu m7, [ref4q]
  psadbw m4, m0
  psadbw m5, m0
  psadbw m6, m0
  psadbw m7, m0
%endmacro

; PROCESS_16x1x4 offset
%macro PROCESS_16x1x4 1
  mova m0, [srcq + %1]
  movu m1, [ref1q + ref_offsetq + %1]
  movu m2, [ref2q + ref_offsetq + %1]
  psadbw m1, m0
  psadbw m2, m0
  paddd m4, m1
  paddd m5, m2

  movu m1, [ref3q + ref_offsetq + %1]
  movu m2, [ref4q + ref_offsetq + %1]
  psadbw m1, m0
  psadbw m2, m0
  paddd m6, m1
  paddd m7, m2
%endmacro

; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; Macro Arguments:
;   1: Width
;   2: Height
;   3: If 0, then normal sad, else skip rows
%macro SADNXN4D 2-3 0

%define spill_src_stride 0
%define spill_ref_stride 0
%define spill_cnt 0

; Whether a shared offset should be used instead of adding strides to
; each reference array. With this option, only one line will be processed
; per loop iteration.
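; (In that mode, a single ref_offsetq register is advanced by ref_strideq
; each row and added to all four ref pointers, which themselves never
; move; see PROCESS_16x1x4 above.)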
%define use_ref_offset (%1 >= mmsize)

; Remove loops in the 4x4 and 8x4 case
%define use_loop (use_ref_offset || %2 > 4)

%if %3 == 1  ; skip rows
%if AOM_ARCH_X86_64
%if use_ref_offset
cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
                           ref2, ref3, ref4, cnt, ref_offset
%elif use_loop
cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
                           ref2, ref3, ref4, cnt
%else
cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
                           ref2, ref3, ref4
%endif
%else
%if use_ref_offset
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
                           ref4
%define spill_src_stride 1
%define spill_ref_stride 1
%elif use_loop
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
                           ref3, ref4
%define spill_src_stride 1
%else
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
                           ref3, ref4
%endif
%endif
%else  ; normal sad
%if AOM_ARCH_X86_64
%if use_ref_offset
cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                     ref3, ref4, cnt, ref_offset
%elif use_loop
cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                     ref3, ref4, cnt
%else
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                     ref3, ref4
%endif
%else
%if use_ref_offset
cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
  %define spill_src_stride 1
  %define spill_ref_stride 1
%elif use_loop
cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
  %define spill_src_stride 1
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
                     ref4
%endif
%endif
%endif

%if spill_src_stride
  %define src_strideq r1mp
  %define src_strided r1mp
%endif
%if spill_ref_stride
  %define ref_strideq r3mp
  %define ref_strided r3mp
%endif

%if spill_cnt
  SUB rsp, 4
  %define cntd word [rsp]
%endif

%if %3 == 1
  sal src_strided, 1
  sal ref_strided, 1
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided

  mov ref2q, [ref1q+gprsize*1]
  mov ref3q, [ref1q+gprsize*2]
  mov ref4q, [ref1q+gprsize*3]
  mov ref1q, [ref1q+gprsize*0]

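; Note: on entry ref1q held the uint8_t *ref[4] array itself; the four
; loads above replaced it with the four block pointers, reading ref[0]
; last so that ref1q could be safely overwritten.
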
; Is the loop for this wxh in another function?
; If so, we jump into that function for the loop and the return.
%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)

%if use_ref_offset
  PROCESS_FIRST_MMSIZE
%if %1 > mmsize
  mov ref_offsetq, 0
  mov cntd, %2 >> %3
; Jump part way into the loop for the square version of this width
%if %3 == 1
  jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
%else
  jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
%endif
%else
  mov ref_offsetq, ref_strideq
  add srcq, src_strideq
  mov cntd, (%2 >> %3) - 1
%endif
%if external_loop == 0
.loop:
; Unrolled horizontal loop
%assign h_offset 0
%rep %1/mmsize
  PROCESS_16x1x4 h_offset
%if h_offset == 0
; The first row of the first column is done outside the loop and jumps here
.midloop:
%endif
%assign h_offset h_offset+mmsize
%endrep

  add srcq, src_strideq
  add ref_offsetq, ref_strideq
  sub cntd, 1
  jnz .loop
%endif
%else
  PROCESS_%1x2x4 1
  ADVANCE_END_OF_TWO_LINES
%if use_loop
  mov cntd, (%2/2 >> %3) - 1
.loop:
%endif
  PROCESS_%1x2x4 0
%if use_loop
  ADVANCE_END_OF_TWO_LINES
  sub cntd, 1
  jnz .loop
%endif
%endif

%if spill_cnt
; Undo stack allocation for cnt
  ADD rsp, 4
%endif

%if external_loop == 0
%if %3 == 0
  %define resultq r4
  %define resultmp r4mp
%endif

; Undo the modifications made to the parameters on the stack
%if %3 == 1
%if spill_src_stride
  shr src_strided, 1
%endif
%if spill_ref_stride
  shr ref_strided, 1
%endif
%endif

%if %1 > 4
  pslldq m5, 4
  pslldq m7, 4
  por m4, m5
  por m6, m7
  mova m5, m4
  mova m7, m6
  punpcklqdq m4, m6
  punpckhqdq m5, m7
  paddd m4, m5
%if %3 == 1
  pslld m4, 1
%endif
  movifnidn resultq, resultmp
  movu [resultq], m4
  RET
%else
  pshufd m6, m6, 0x08
  pshufd m7, m7, 0x08
%if %3 == 1
  pslld m6, 1
  pslld m7, 1
%endif
  movifnidn resultq, resultmp
  movq [resultq+0], m6
  movq [resultq+8], m7
  RET
%endif
%endif ; external_loop == 0
%endmacro

INIT_XMM sse2
SADNXN4D 128, 128
SADNXN4D 128, 64
SADNXN4D 64, 128
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16, 8
SADNXN4D 8, 16
SADNXN4D 8, 8
SADNXN4D 8, 4
SADNXN4D 4, 8
SADNXN4D 4, 4
%if CONFIG_REALTIME_ONLY==0
SADNXN4D 4, 16
SADNXN4D 16, 4
SADNXN4D 8, 32
SADNXN4D 32, 8
SADNXN4D 16, 64
SADNXN4D 64, 16
%endif
SADNXN4D 128, 128, 1
SADNXN4D 128, 64, 1
SADNXN4D 64, 128, 1
SADNXN4D 64, 64, 1
SADNXN4D 64, 32, 1
SADNXN4D 32, 64, 1
SADNXN4D 32, 32, 1
SADNXN4D 32, 16, 1
SADNXN4D 16, 32, 1
SADNXN4D 16, 16, 1
SADNXN4D 16, 8, 1
SADNXN4D 8, 16, 1
SADNXN4D 8, 8, 1
SADNXN4D 4, 8, 1
%if CONFIG_REALTIME_ONLY==0
SADNXN4D 4, 16, 1
SADNXN4D 8, 32, 1
SADNXN4D 32, 8, 1
SADNXN4D 16, 64, 1
SADNXN4D 64, 16, 1
%endif

; Different assembly is needed when the height gets subsampled to 2
; SADNXN4D 16, 4, 1
; SADNXN4D 8, 4, 1
; SADNXN4D 4, 4, 1
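
; For reference, a hedged C sketch of the computation performed by the
; normal (non-skip) aom_sadNxNx4d functions above; the skip-row variants
; sum only every other row and then double the result:
;
;   // illustrative pseudocode, not part of the build
;   for (int i = 0; i < 4; i++) {
;     uint32_t sad = 0;
;     for (int r = 0; r < height; r++)
;       for (int c = 0; c < width; c++)
;         sad += abs(src[r * src_stride + c] - ref[i][r * ref_stride + c]);
;     res[i] = sad;
;   }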