xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; TABULATE_SSIM - fold one row of samples into the five running totals.
;
; In:       xmm3 = source samples (s) as eight 16-bit words
;           xmm4 = reference samples (r) as eight 16-bit words
; Updates:  xmm15 += s    per word, unsigned saturating      (sum_s)
;           xmm14 += r    per word, unsigned saturating      (sum_r)
;           xmm13 += s*s  per dword, adjacent pairs combined (sum_sq_s)
;           xmm12 += r*r  per dword, adjacent pairs combined (sum_sq_r)
;           xmm11 += s*r  per dword, adjacent pairs combined (sum_sxr)
; Clobbers: xmm1 (s*s), xmm2 (r*r), xmm3 (s*r)
%macro TABULATE_SSIM 0
        ; Take working copies first: the squares need the unmodified inputs,
        ; and xmm3 is overwritten by the cross product below.
        movdqa          xmm1, xmm3
        movdqa          xmm2, xmm4

        ; Plain sums stay in 16-bit lanes.
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        ; pmaddwd multiplies word lanes and adds each adjacent pair of
        ; products, leaving four 32-bit results per register.
        pmaddwd         xmm1, xmm1      ; s*s
        pmaddwd         xmm2, xmm2      ; r*r
        pmaddwd         xmm3, xmm4      ; s*r

        paddd           xmm13, xmm1     ; sum_sq_s
        paddd           xmm12, xmm2     ; sum_sq_r
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro
26
; SUM_ACROSS_Q - horizontal add of the four 32-bit lanes (d0..d3) of %1.
;
; The dwords are widened to qwords by interleaving with xmm0, so xmm0 must be
; all zeros on entry. On exit the low qword of %1 holds d0+d1+d2+d3 and the
; high qword is zero.
; Clobbers: xmm2 (so %1 must not be xmm2, nor xmm0).
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; xmm2 = { d2, d3 } as qwords
        punpckldq       %1, xmm0        ; %1   = { d0, d1 } as qwords
        paddq           %1, xmm2        ; %1   = { d0+d2, d1+d3 }

        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; xmm2 = { d1+d3, 0 }
        punpcklqdq      %1, xmm0        ; %1   = { d0+d2, 0 }
        paddq           %1, xmm2        ; low qword = d0+d1+d2+d3
%endmacro
38
; SUM_ACROSS_W - horizontal add of the eight 16-bit lanes (w0..w7) of %1.
;
; The words are zero-extended to dwords by interleaving with xmm0 (which must
; be all zeros on entry), the two halves are added lane-wise, and
; SUM_ACROSS_Q then reduces the four dword partial sums to a single total in
; the low qword of %1.
; Clobbers: xmm1, plus xmm2 through SUM_ACROSS_Q.
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; xmm1 = { w4, w5, w6, w7 } as dwords
        punpcklwd       %1, xmm0        ; %1   = { w0, w1, w2, w3 } as dwords
        paddd           %1, xmm1        ; %1   = { w0+w4, w1+w5, w2+w6, w3+w7 }
        SUM_ACROSS_Q    %1
%endmacro
47
48SECTION .text
49
;void vpx_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Reads eight rows of eight bytes from s and from r, stepping the row
; pointers by sp and rp bytes respectively, and stores (overwriting, not
; accumulating) five 32-bit totals through the output pointers:
;   *sum_s    = sum of s[i]
;   *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]
;   *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; Register use inside the loop:
;   rsi = current source row       rcx = source stride (sign-extended sp)
;   rdi = current reference row    rax = reference stride (sign-extended rp)
;   rdx = rows remaining
;   xmm0  = zero (needed by the unpack-based widening and SUM_ACROSS_*)
;   xmm15 = sum_s, xmm14 = sum_r, xmm13 = sum_sq_s, xmm12 = sum_sq_r,
;   xmm11 = sum_sxr; xmm1-xmm4 are scratch.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
globalsym(vpx_ssim_parms_8x8_sse2)
sym(vpx_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them rather than loading the whole 64-bit slot
    ; before they are used in 64-bit pointer arithmetic.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab eight source and eight reference pixels and widen bytes to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; reduce each accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
130