xref: /aosp_15_r20/external/libaom/aom_dsp/x86/jnt_sad_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include <assert.h>
13 #include <emmintrin.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom_dsp/x86/synonyms.h"
19 
sad4xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)20 static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride,
21                                 const uint8_t *b, int b_stride, int width,
22                                 int height) {
23   int i;
24   assert(width == 4);
25   (void)width;
26 
27   __m128i sad = _mm_setzero_si128();
28   for (i = 0; i < height; i += 4) {
29     __m128i x0 = xx_loadl_32(a + 0 * a_stride);
30     __m128i x1 = xx_loadl_32(a + 1 * a_stride);
31     __m128i x2 = xx_loadl_32(a + 2 * a_stride);
32     __m128i x3 = xx_loadl_32(a + 3 * a_stride);
33     __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
34     __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
35 
36     __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
37 
38     x0 = xx_loadl_32(b + 0 * b_stride);
39     x1 = xx_loadl_32(b + 1 * b_stride);
40     x2 = xx_loadl_32(b + 2 * b_stride);
41     x3 = xx_loadl_32(b + 3 * b_stride);
42     x_lo = _mm_unpacklo_epi32(x0, x1);
43     x_hi = _mm_unpacklo_epi32(x2, x3);
44 
45     __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
46 
47     __m128i sad4x4 = _mm_sad_epu8(x, y);
48     sad = _mm_add_epi32(sad, sad4x4);
49 
50     a += 4 * a_stride;
51     b += 4 * b_stride;
52   }
53 
54   // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
55   const unsigned int res =
56       (unsigned int)(_mm_cvtsi128_si32(sad) +
57                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
58 
59   return res;
60 }
61 
sad8xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)62 static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride,
63                                 const uint8_t *b, int b_stride, int width,
64                                 int height) {
65   int i;
66   assert(width == 8);
67   (void)width;
68 
69   __m128i sad = _mm_setzero_si128();
70   for (i = 0; i < height; i += 2) {
71     __m128i x0 = xx_loadl_64(a + 0 * a_stride);
72     __m128i x1 = xx_loadl_64(a + 1 * a_stride);
73 
74     __m128i x = _mm_unpacklo_epi64(x0, x1);
75 
76     x0 = xx_loadl_64(b + 0 * b_stride);
77     x1 = xx_loadl_64(b + 1 * b_stride);
78 
79     __m128i y = _mm_unpacklo_epi64(x0, x1);
80 
81     __m128i sad8x2 = _mm_sad_epu8(x, y);
82     sad = _mm_add_epi32(sad, sad8x2);
83 
84     a += 2 * a_stride;
85     b += 2 * b_stride;
86   }
87 
88   const unsigned int res =
89       (unsigned int)(_mm_cvtsi128_si32(sad) +
90                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
91 
92   return res;
93 }
94 
sad16xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)95 static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride,
96                                  const uint8_t *b, int b_stride, int width,
97                                  int height) {
98   int i;
99   assert(width == 16);
100   (void)width;
101 
102   __m128i sad = _mm_setzero_si128();
103   for (i = 0; i < height; ++i) {
104     __m128i x = xx_loadu_128(a);
105     __m128i y = xx_loadu_128(b);
106 
107     __m128i sad16x1 = _mm_sad_epu8(x, y);
108     sad = _mm_add_epi32(sad, sad16x1);
109 
110     a += a_stride;
111     b += b_stride;
112   }
113 
114   const unsigned int res =
115       (unsigned int)(_mm_cvtsi128_si32(sad) +
116                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
117 
118   return res;
119 }
120 
sad32xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)121 static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride,
122                                  const uint8_t *b, int b_stride, int width,
123                                  int height) {
124   int i, j;
125   assert(width == 32);
126   (void)width;
127 
128   __m128i sad = _mm_setzero_si128();
129   for (i = 0; i < height; ++i) {
130     for (j = 0; j < 2; ++j) {
131       __m128i x = xx_loadu_128(a + j * 16);
132       __m128i y = xx_loadu_128(b + j * 16);
133 
134       __m128i sad32_half = _mm_sad_epu8(x, y);
135       sad = _mm_add_epi32(sad, sad32_half);
136     }
137 
138     a += a_stride;
139     b += b_stride;
140   }
141 
142   const unsigned int res =
143       (unsigned int)(_mm_cvtsi128_si32(sad) +
144                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
145 
146   return res;
147 }
148 
sad64xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)149 static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride,
150                                  const uint8_t *b, int b_stride, int width,
151                                  int height) {
152   int i, j;
153   assert(width == 64);
154   (void)width;
155 
156   __m128i sad = _mm_setzero_si128();
157   for (i = 0; i < height; ++i) {
158     for (j = 0; j < 4; ++j) {
159       __m128i x = xx_loadu_128(a + j * 16);
160       __m128i y = xx_loadu_128(b + j * 16);
161 
162       __m128i sad64_quarter = _mm_sad_epu8(x, y);
163       sad = _mm_add_epi32(sad, sad64_quarter);
164     }
165 
166     a += a_stride;
167     b += b_stride;
168   }
169 
170   const unsigned int res =
171       (unsigned int)(_mm_cvtsi128_si32(sad) +
172                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
173 
174   return res;
175 }
176 
sad128xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)177 static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride,
178                                   const uint8_t *b, int b_stride, int width,
179                                   int height) {
180   int i, j;
181   assert(width == 128);
182   (void)width;
183 
184   __m128i sad = _mm_setzero_si128();
185   for (i = 0; i < height; ++i) {
186     for (j = 0; j < 8; ++j) {
187       __m128i x = xx_loadu_128(a + j * 16);
188       __m128i y = xx_loadu_128(b + j * 16);
189 
190       __m128i sad64_quarter = _mm_sad_epu8(x, y);
191       sad = _mm_add_epi32(sad, sad64_quarter);
192     }
193 
194     a += a_stride;
195     b += b_stride;
196   }
197 
198   const unsigned int res =
199       (unsigned int)(_mm_cvtsi128_si32(sad) +
200                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
201 
202   return res;
203 }
204 
/*
 * Defines aom_dist_wtd_sad<m>x<n>_avg_sse2() for one block size.
 *
 * The generated function asks aom_dist_wtd_comp_avg_pred() to fill a
 * temporary m*n byte buffer from `ref`, `second_pred` and `jcp_param`, then
 * returns the SAD between `src` and that buffer via sad<m>xh_sse2(). The
 * temporary buffer lives on the stack and is tightly packed, which is why its
 * stride is passed as m.
 *
 * `m` and `n` are pasted into identifiers, so they must be plain integer
 * literals, and a matching static sad<m>xh_sse2() helper must exist.
 * NOTE(review): presumably the helper call produces the distance-weighted
 * compound prediction of `ref` and `second_pred` -- confirm against its
 * declaration in aom_dsp_rtcd.h.
 */
#define DIST_WTD_SADMXN_SSE2(m, n)                                            \
  unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint8_t comp_pred[m * n];                                                 \
    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
                               jcp_param);                                    \
    return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);              \
  }
214 
215 DIST_WTD_SADMXN_SSE2(128, 128)
216 DIST_WTD_SADMXN_SSE2(128, 64)
217 DIST_WTD_SADMXN_SSE2(64, 128)
218 DIST_WTD_SADMXN_SSE2(64, 64)
219 DIST_WTD_SADMXN_SSE2(64, 32)
220 DIST_WTD_SADMXN_SSE2(32, 64)
221 DIST_WTD_SADMXN_SSE2(32, 32)
222 DIST_WTD_SADMXN_SSE2(32, 16)
223 DIST_WTD_SADMXN_SSE2(16, 32)
224 DIST_WTD_SADMXN_SSE2(16, 16)
225 DIST_WTD_SADMXN_SSE2(16, 8)
226 DIST_WTD_SADMXN_SSE2(8, 16)
227 DIST_WTD_SADMXN_SSE2(8, 8)
228 DIST_WTD_SADMXN_SSE2(8, 4)
229 DIST_WTD_SADMXN_SSE2(4, 8)
230 DIST_WTD_SADMXN_SSE2(4, 4)
231 #if !CONFIG_REALTIME_ONLY
232 DIST_WTD_SADMXN_SSE2(4, 16)
233 DIST_WTD_SADMXN_SSE2(16, 4)
234 DIST_WTD_SADMXN_SSE2(8, 32)
235 DIST_WTD_SADMXN_SSE2(32, 8)
236 DIST_WTD_SADMXN_SSE2(16, 64)
237 DIST_WTD_SADMXN_SSE2(64, 16)
238 #endif
239