xref: /aosp_15_r20/external/libaom/aom_dsp/x86/variance_ssse3.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <stddef.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <stdint.h>
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"
19*77c1e3ccSAndroid Build Coastguard Worker 
// Prototypes for the fixed-width, variable-height ("xh") sub-pixel variance
// kernels. These definitions are for functions defined in subpel_variance.asm.
//
// Each kernel takes the number of rows to process in `height`, returns an int
// and stores an unsigned int through `sse`. The wrappers generated in this
// file add the return value to a running sum and the stored value to a running
// squared-error total.
//
// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                           \
  int aom_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)

// Declares the 4-, 8- and 16-pixel-wide kernels for one instruction set. The
// final DECL has no trailing semicolon; the invocation site supplies it.
#define DECLS(opt) \
  DECL(4, opt);    \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(ssse3);
#undef DECLS
#undef DECL
35*77c1e3ccSAndroid Build Coastguard Worker 
// FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) defines the function
//   unsigned int aom_sub_pixel_variance<w>x<h>_<opt>(...)
// on top of the <wf>-pixel-wide "xh" kernel aom_sub_pixel_variance<wf>xh_<opt>.
//
// The w x h block is walked as (w / wf) columns of wf pixels. Each column is
// walked in (h / hf) strips of hf = AOMMIN(h, 64) rows, one kernel call per
// strip. The kernel's return value is added to `se`, and the value it stores
// through its sse pointer is added to `sse`.
//
// On return *sse_ptr holds the accumulated `sse`, and the function result is
//   sse - ((se * se) >> (wlog2 + hlog2)).
//
// `cast` is applied to the first operand of the se * se multiply and
// `cast_prod` to the product before the shift, so every instantiation chooses
// the integer types used for the squaring step.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
    /*Avoid overflow in helper by capping height.*/                           \
    const int hf = AOMMIN(h, 64);                                             \
    unsigned int sse = 0;                                                     \
    int se = 0;                                                               \
    /* Outer loop: one iteration per wf-wide column of the block. */          \
    for (int i = 0; i < (w / wf); ++i) {                                      \
      const uint8_t *src_ptr = src;                                           \
      const uint8_t *dst_ptr = dst;                                           \
      /* Inner loop: one kernel call per strip of hf rows. */                 \
      for (int j = 0; j < (h / hf); ++j) {                                    \
        unsigned int sse2;                                                    \
        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
            &sse2, NULL, NULL);                                               \
        dst_ptr += hf * dst_stride;                                           \
        src_ptr += hf * src_stride;                                           \
        se += se2;                                                            \
        sse += sse2;                                                          \
      }                                                                       \
      /* Step both buffers to the next wf-wide column. */                     \
      src += wf;                                                              \
      dst += wf;                                                              \
    }                                                                         \
    *sse_ptr = sse;                                                           \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
  }
63*77c1e3ccSAndroid Build Coastguard Worker 
64*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY
65*77c1e3ccSAndroid Build Coastguard Worker #define FNS(opt)                                    \
66*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
67*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
68*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
69*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
70*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
71*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
72*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
73*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
74*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
75*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
76*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
77*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
78*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
79*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
80*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
81*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
82*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
83*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
84*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
85*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
86*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
87*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
88*77c1e3ccSAndroid Build Coastguard Worker #else
89*77c1e3ccSAndroid Build Coastguard Worker #define FNS(opt)                                    \
90*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
91*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
92*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
93*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
94*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
95*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
96*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
97*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
98*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
99*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
100*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
101*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
102*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
103*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
104*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
105*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
106*77c1e3ccSAndroid Build Coastguard Worker #endif
107*77c1e3ccSAndroid Build Coastguard Worker 
108*77c1e3ccSAndroid Build Coastguard Worker FNS(ssse3)
109*77c1e3ccSAndroid Build Coastguard Worker 
110*77c1e3ccSAndroid Build Coastguard Worker #undef FNS
111*77c1e3ccSAndroid Build Coastguard Worker #undef FN
112*77c1e3ccSAndroid Build Coastguard Worker 
// Prototypes for the fixed-width, variable-height ("xh") sub-pixel avg
// variance kernels. They have the same shape as the
// aom_sub_pixel_variance<w>xh_<opt> prototypes with two extra arguments: a
// third buffer `sec` and its stride `sec_stride`.
// NOTE(review): presumably implemented in assembly alongside the non-avg
// kernels -- confirm.
//
// Each kernel takes the number of rows to process in `height`, returns an int
// and stores an unsigned int through `sse`.
//
// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)

// Declares the 4-, 8- and 16-pixel-wide kernels for one instruction set. The
// final DECL has no trailing semicolon; the invocation site supplies it.
#define DECLS(opt) \
  DECL(4, opt);    \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(ssse3);
#undef DECLS
#undef DECL
128*77c1e3ccSAndroid Build Coastguard Worker 
// FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) defines the function
//   unsigned int aom_sub_pixel_avg_variance<w>x<h>_<opt>(...)
// on top of the <wf>-pixel-wide kernel aom_sub_pixel_avg_variance<wf>xh_<opt>.
//
// The w x h block is walked as (w / wf) columns of wf pixels. Each column is
// walked in (h / hf) strips of hf = AOMMIN(h, 64) rows, one kernel call per
// strip. The kernel's return value is added to `se`, and the value it stores
// through its sse pointer is added to `sse`.
//
// `sec` has no stride parameter: it is handed to the kernel with a stride of
// w and advanced by hf * w per strip, i.e. it is addressed as a buffer whose
// row pitch equals the block width.
//
// On return *sse_ptr holds the accumulated `sse`, and the function result is
//   sse - ((se * se) >> (wlog2 + hlog2)).
//
// `cast` is applied to the first operand of the se * se multiply and
// `cast_prod` to the product before the shift, so every instantiation chooses
// the integer types used for the squaring step.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
      const uint8_t *sec) {                                                  \
    /*Avoid overflow in helper by capping height.*/                          \
    const int hf = AOMMIN(h, 64);                                            \
    unsigned int sse = 0;                                                    \
    int se = 0;                                                              \
    /* Outer loop: one iteration per wf-wide column of the block. */         \
    for (int i = 0; i < (w / wf); ++i) {                                     \
      const uint8_t *src_ptr = src;                                          \
      const uint8_t *dst_ptr = dst;                                          \
      const uint8_t *sec_ptr = sec;                                          \
      /* Inner loop: one kernel call per strip of hf rows. */                \
      for (int j = 0; j < (h / hf); ++j) {                                   \
        unsigned int sse2;                                                   \
        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
        dst_ptr += hf * dst_stride;                                          \
        src_ptr += hf * src_stride;                                          \
        sec_ptr += hf * w;                                                   \
        se += se2;                                                           \
        sse += sse2;                                                         \
      }                                                                      \
      /* Step all three buffers to the next wf-wide column. */               \
      src += wf;                                                             \
      dst += wf;                                                             \
      sec += wf;                                                             \
    }                                                                        \
    *sse_ptr = sse;                                                          \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }
160*77c1e3ccSAndroid Build Coastguard Worker 
161*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY
162*77c1e3ccSAndroid Build Coastguard Worker #define FNS(opt)                                    \
163*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
164*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
165*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
166*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
167*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
168*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
169*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
170*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
171*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
172*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
173*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
174*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
175*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
176*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
177*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
178*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
179*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
180*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
181*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
182*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
183*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
184*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
185*77c1e3ccSAndroid Build Coastguard Worker #else
186*77c1e3ccSAndroid Build Coastguard Worker #define FNS(opt)                                    \
187*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
188*77c1e3ccSAndroid Build Coastguard Worker   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
189*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
190*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
191*77c1e3ccSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
192*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
193*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
194*77c1e3ccSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
195*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
196*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
197*77c1e3ccSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
198*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
199*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
200*77c1e3ccSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
201*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
202*77c1e3ccSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
203*77c1e3ccSAndroid Build Coastguard Worker #endif
204*77c1e3ccSAndroid Build Coastguard Worker 
205*77c1e3ccSAndroid Build Coastguard Worker FNS(ssse3)
206*77c1e3ccSAndroid Build Coastguard Worker 
207*77c1e3ccSAndroid Build Coastguard Worker #undef FNS
208*77c1e3ccSAndroid Build Coastguard Worker #undef FN
209