/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"

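// Computes the sum of differences (*sum) and the sum of squared differences
// (*sse) between two 4x4 blocks of high bit-depth pixels. a8 and b8 are
// CONVERT_TO_BYTEPTR-style pointers that actually address 16-bit sample data.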
static inline void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
                                         const uint8_t *b8, int b_stride,
                                         uint64_t *sse, int64_t *sum) {
  __m128i u0, u1, u2, u3;
  __m128i s0, s1, s2, s3;
  __m128i t0, t1, x0, y0;
  __m128i a0, a1, a2, a3;
  __m128i b0, b1, b2, b3;
  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);

  // Load the four 4-pixel (64-bit) rows of each block.
  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));

  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));

  // Interleave row pairs so each 128-bit register holds two rows.
  u0 = _mm_unpacklo_epi16(a0, a1);
  u1 = _mm_unpacklo_epi16(a2, a3);
  u2 = _mm_unpacklo_epi16(b0, b1);
  u3 = _mm_unpacklo_epi16(b2, b3);

  // Per-pixel differences, eight 16-bit lanes per register.
  s0 = _mm_sub_epi16(u0, u2);
  s1 = _mm_sub_epi16(u1, u3);

  // Widen the differences to 32 bits (madd with 1) and reduce them
  // horizontally to the block sum.
  t0 = _mm_madd_epi16(s0, k_one_epi16);
  t1 = _mm_madd_epi16(s1, k_one_epi16);

  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  y0 = _mm_hadd_epi32(s3, s3);

  // Square the differences (madd of each register with itself) and reduce
  // them to the sum of squared errors.
  t0 = _mm_madd_epi16(s0, s0);
  t1 = _mm_madd_epi16(s1, s1);

  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  x0 = _mm_hadd_epi32(s3, s3);

  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
  *sum = (int64_t)_mm_extract_epi32(y0, 0);
}

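// Full-pixel variance for the 4x4 block: variance = sse - sum^2 / 16.
// The 10- and 12-bit versions first round sse and sum back down to the
// 8-bit scale (sum by 2 or 4 bits, sse by 4 or 8 bits) so the returned
// values stay in the same range as the 8-bit path.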
uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)local_sse;

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
  sum = ROUND_POWER_OF_TWO(sum, 2);

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
  sum = ROUND_POWER_OF_TWO(sum, 4);

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return diff >= 0 ? (uint32_t)diff : 0;
}

// Sub-pixel
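// The source block is interpolated with the 2-tap bilinear filters in two
// passes: the first pass filters (4 + 1) rows horizontally using the filter
// selected by xoffset, and the second pass filters the intermediate result
// vertically using the filter selected by yoffset. The variance of the
// filtered 4x4 block against dst is then computed with the full-pixel
// kernels above.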
uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
                                  sse);
}

uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
                                   dst_stride, sse);
}

uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
                                   dst_stride, sse);
}

// Sub-pixel average

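// Same two-pass bilinear interpolation as above, but the filtered block is
// first averaged with second_pred (aom_highbd_comp_avg_pred) into the
// 16-byte-aligned temp3 buffer before the variance is computed.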
uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
                           CONVERT_TO_BYTEPTR(temp2), 4);

  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
                                  sse);
}

uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
                           CONVERT_TO_BYTEPTR(temp2), 4);

  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                   dst_stride, sse);
}

uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
                           CONVERT_TO_BYTEPTR(temp2), 4);

  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                   dst_stride, sse);
}