xref: /aosp_15_r20/external/libaom/aom_dsp/x86/ssim_sse2_x86_64.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "aom_ports/x86_abi_support.asm"
15
; TABULATE_SSIM - fold one row of pixels into the five running SSIM sums.
;
; In:    xmm3 = source pixels (s) as 8 words
;        xmm4 = reference pixels (r) as 8 words
; Out:   xmm15 += s     word lanes, saturating add          (sum_s)
;        xmm14 += r     word lanes, saturating add          (sum_r)
;        xmm13 += s*s   adjacent pairs summed, dword lanes  (sum_sq_s)
;        xmm12 += r*r   adjacent pairs summed, dword lanes  (sum_sq_r)
;        xmm11 += s*r   adjacent pairs summed, dword lanes  (sum_sxr)
; Clobb: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
        ; Copy both inputs up front: the squares are formed in the copies so
        ; the originals stay intact for the word sums and the cross product.
        movdqa          xmm1, xmm3
        movdqa          xmm2, xmm4

        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        pmaddwd         xmm1, xmm1      ; s*s
        pmaddwd         xmm2, xmm2      ; r*r
        pmaddwd         xmm3, xmm4      ; s*r (xmm3 is consumed here)

        paddd           xmm13, xmm1     ; sum_sq_s
        paddd           xmm12, xmm2     ; sum_sq_r
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro
29
; SUM_ACROSS_Q - horizontal sum of the four dwords held in register %1.
;
; The dwords are first zero-extended to qwords, so the additions are done at
; 64-bit width. The total ends up in the low qword of %1; the high qword is
; zero.
;
; In:    %1   = four dword lanes to add up
;        xmm0 = all zero (used as the unpack partner)
; Clobb: xmm2
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; dwords 2,3 -> two qwords
        punpckldq       %1, xmm0        ; dwords 0,1 -> two qwords
        paddq           %1, xmm2        ; two partial sums

        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; upper partial sum into the low qword
        punpcklqdq      %1, xmm0        ; keep lower partial sum, zero the top
        paddq           %1, xmm2        ; low qword = total
%endmacro
41
; SUM_ACROSS_W - horizontal sum of the eight words held in register %1.
;
; The words are zero-extended to dwords and added pairwise, then
; SUM_ACROSS_Q finishes the reduction, leaving the total in the low qword
; of %1.
;
; In:    %1   = eight word lanes to add up
;        xmm0 = all zero (used as the unpack partner)
; Clobb: xmm1, plus whatever SUM_ACROSS_Q clobbers (xmm2)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; words 4..7 -> four dwords
        punpcklwd       %1, xmm0        ; words 0..3 -> four dwords
        paddd           %1, xmm1        ; four partial sums
        SUM_ACROSS_Q    %1
%endmacro
50
51SECTION .text
52
;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Reads 8 rows of 8 pixels from s (row stride sp) and from r (row stride rp)
; and stores five 32-bit totals through the output pointers: the sum of s,
; the sum of r, the sum of s*s, the sum of r*r and the sum of s*r.
;
; Register roles inside the row loop:
;   rsi = current s row          rcx = sp, sign-extended to 64 bits
;   rdi = current r row          rax = rp, sign-extended to 64 bits
;   rdx = rows remaining         xmm0 = zero (unpack partner)
;   xmm15, xmm14, xmm13, xmm12, xmm11 =
;       sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;   xmm1-xmm4 = scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ; s
    mov             rdi,        arg(2)          ; r

    ; The strides are C ints, so only the low 32 bits of their argument
    ; slots are meaningful. Sign-extend them instead of loading the whole
    ; 64-bit slot; otherwise a negative stride, or stale upper bits, would
    ; turn the row advance below into a wild pointer.
    movsxd          rcx,        dword arg(1)    ; sp
    movsxd          rax,        dword arg(3)    ; rp

    pxor            xmm0, xmm0                  ; zero, for byte->word unpack
    pxor            xmm15, xmm15                ; sum_s
    pxor            xmm14, xmm14                ; sum_r
    pxor            xmm13, xmm13                ; sum_sq_s
    pxor            xmm12, xmm12                ; sum_sq_r
    pxor            xmm11, xmm11                ; sum_sxr

    mov             rdx, 8                      ; row counter
.NextRow:

    ; grab 8 source and 8 reference pixels and widen them to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0                  ; low_s
    punpcklbw       xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add             rsi, rcx                    ; next s row
    add             rdi, rax                    ; next r row

    dec             rdx                         ; counter
    jnz             .NextRow

    ; collapse each accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi, arg(4)
    movd            [rdi], xmm15                ; *sum_s
    mov             rdi, arg(5)
    movd            [rdi], xmm14                ; *sum_r
    mov             rdi, arg(6)
    movd            [rdi], xmm13                ; *sum_sq_s
    mov             rdi, arg(7)
    movd            [rdi], xmm12                ; *sum_sq_r
    mov             rdi, arg(8)
    movd            [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
133