;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

; TABULATE_SSIM - accumulate sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr
;   In:       xmm3 = s values, xmm4 = r values (both treated as 16-bit lanes)
;   Updates:  xmm15 += s        (saturating word add)     ; sum_s
;             xmm14 += r        (saturating word add)     ; sum_r
;             xmm13 += pmaddwd(s, s)  (dword add)         ; sum_sq_s
;             xmm12 += pmaddwd(r, r)  (dword add)         ; sum_sq_r
;             xmm11 += pmaddwd(s, r)  (dword add)         ; sum_sxr
;   Clobbers: xmm1, xmm2, xmm3 (xmm4 is left untouched)
%macro TABULATE_SSIM 0
        movdqa          xmm1, xmm3          ; keep a copy of s for s*s
        movdqa          xmm2, xmm4          ; keep a copy of r for r*r
        paddusw         xmm15, xmm3         ; sum_s
        paddusw         xmm14, xmm4         ; sum_r
        pmaddwd         xmm3, xmm4          ; s*r, adjacent pairs added
        pmaddwd         xmm1, xmm1          ; s*s, adjacent pairs added
        pmaddwd         xmm2, xmm2          ; r*r, adjacent pairs added
        paddd           xmm13, xmm1         ; sum_sq_s
        paddd           xmm12, xmm2         ; sum_sq_r
        paddd           xmm11, xmm3         ; sum_sxr
%endmacro

; SUM_ACROSS_Q - add together the four 32-bit lanes of register %1.
;   The lanes are first interleaved with xmm0 (expected to hold zero) so
;   that they become 64-bit values; the total ends up in the low qword of %1.
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0          ; upper two dwords -> qwords
        punpckldq       %1, xmm0            ; lower two dwords -> qwords
        paddq           %1, xmm2
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0          ; high qword moved to the low half
        punpcklqdq      %1, xmm0            ; low qword kept, high half from xmm0
        paddq           %1, xmm2
%endmacro

; SUM_ACROSS_W - add together the eight 16-bit lanes of register %1.
;   The words are interleaved with xmm0 (expected to hold zero) to form
;   dwords, the two halves are added, and SUM_ACROSS_Q finishes the job.
;   Clobbers: xmm1, plus xmm2 through SUM_ACROSS_Q
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0          ; upper four words -> dwords
        punpcklwd       %1, xmm0            ; lower four words -> dwords
        paddd           %1, xmm1
        SUM_ACROSS_Q    %1
%endmacro

SECTION .text

;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play
; with dssim as distortion
; in mode selection code.
;
; aom_ssim_parms_8x8_sse2 - walks 8 rows of 8 pixels from s and r and writes
; five 32-bit totals through the output pointers (arguments 4..8):
;   sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
;
; Register roles in the body:
;   rsi   = current row of s            rcx = sp (row stride of s)
;   rdi   = current row of r            rax = rp (row stride of r)
;   rdx   = rows remaining (starts at 8)
;   xmm0  = zero, used as the unpack partner when widening lanes
;   xmm3 / xmm4 = the current s / r row widened to 16-bit lanes
;   xmm15, xmm14, xmm13, xmm12, xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r,
;                                       sum_sxr accumulators
;   xmm1, xmm2 = temporaries used by TABULATE_SSIM and the SUM_ACROSS_* macros
;
; sp and rp are C ints: only their low 32 bits are defined by the calling
; convention, so they are sign-extended to 64 bits before being added to the
; row pointers.
;
; rsi and rdi are pushed/popped around the body. SHADOW_ARGS_TO_STACK,
; SAVE_XMM, RESTORE_XMM, UNSHADOW_ARGS and arg() are provided by
; x86_abi_support.asm.
; NOTE(review): the dword loads below assume arg(n) expands to a plain
; memory operand - confirm against x86_abi_support.asm.
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)         ; s
        movsxd          rcx, dword arg(1)   ; sp (int -> 64-bit, sign-extended)
        mov             rdi, arg(2)         ; r
        movsxd          rax, dword arg(3)   ; rp (int -> 64-bit, sign-extended)

        pxor            xmm0, xmm0          ; zero for the unpack instructions
        pxor            xmm15, xmm15        ; sum_s
        pxor            xmm14, xmm14        ; sum_r
        pxor            xmm13, xmm13        ; sum_sq_s
        pxor            xmm12, xmm12        ; sum_sq_r
        pxor            xmm11, xmm11        ; sum_sxr

        mov             rdx, 8              ; row counter
.NextRow:

        ; grab 8 source and 8 reference pixels, widen bytes to words
        movq            xmm3, [rsi]
        movq            xmm4, [rdi]
        punpcklbw       xmm3, xmm0          ; low_s
        punpcklbw       xmm4, xmm0          ; low_r

        TABULATE_SSIM

        add             rsi, rcx            ; next s row
        add             rdi, rax            ; next r row

        dec             rdx                 ; counter
        jnz             .NextRow

        ; collapse each accumulator to a single total in its low lane
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; rdi is no longer needed as the r pointer; reuse it for each output
        mov             rdi, arg(4)
        movd            [rdi], xmm15        ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14        ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13        ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12        ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11        ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret