1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <emmintrin.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17
18 #include "aom_dsp/x86/synonyms.h"
19
sad4xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)20 static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride,
21 const uint8_t *b, int b_stride, int width,
22 int height) {
23 int i;
24 assert(width == 4);
25 (void)width;
26
27 __m128i sad = _mm_setzero_si128();
28 for (i = 0; i < height; i += 4) {
29 __m128i x0 = xx_loadl_32(a + 0 * a_stride);
30 __m128i x1 = xx_loadl_32(a + 1 * a_stride);
31 __m128i x2 = xx_loadl_32(a + 2 * a_stride);
32 __m128i x3 = xx_loadl_32(a + 3 * a_stride);
33 __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
34 __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
35
36 __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
37
38 x0 = xx_loadl_32(b + 0 * b_stride);
39 x1 = xx_loadl_32(b + 1 * b_stride);
40 x2 = xx_loadl_32(b + 2 * b_stride);
41 x3 = xx_loadl_32(b + 3 * b_stride);
42 x_lo = _mm_unpacklo_epi32(x0, x1);
43 x_hi = _mm_unpacklo_epi32(x2, x3);
44
45 __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
46
47 __m128i sad4x4 = _mm_sad_epu8(x, y);
48 sad = _mm_add_epi32(sad, sad4x4);
49
50 a += 4 * a_stride;
51 b += 4 * b_stride;
52 }
53
54 // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
55 const unsigned int res =
56 (unsigned int)(_mm_cvtsi128_si32(sad) +
57 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
58
59 return res;
60 }
61
sad8xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)62 static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride,
63 const uint8_t *b, int b_stride, int width,
64 int height) {
65 int i;
66 assert(width == 8);
67 (void)width;
68
69 __m128i sad = _mm_setzero_si128();
70 for (i = 0; i < height; i += 2) {
71 __m128i x0 = xx_loadl_64(a + 0 * a_stride);
72 __m128i x1 = xx_loadl_64(a + 1 * a_stride);
73
74 __m128i x = _mm_unpacklo_epi64(x0, x1);
75
76 x0 = xx_loadl_64(b + 0 * b_stride);
77 x1 = xx_loadl_64(b + 1 * b_stride);
78
79 __m128i y = _mm_unpacklo_epi64(x0, x1);
80
81 __m128i sad8x2 = _mm_sad_epu8(x, y);
82 sad = _mm_add_epi32(sad, sad8x2);
83
84 a += 2 * a_stride;
85 b += 2 * b_stride;
86 }
87
88 const unsigned int res =
89 (unsigned int)(_mm_cvtsi128_si32(sad) +
90 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
91
92 return res;
93 }
94
sad16xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)95 static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride,
96 const uint8_t *b, int b_stride, int width,
97 int height) {
98 int i;
99 assert(width == 16);
100 (void)width;
101
102 __m128i sad = _mm_setzero_si128();
103 for (i = 0; i < height; ++i) {
104 __m128i x = xx_loadu_128(a);
105 __m128i y = xx_loadu_128(b);
106
107 __m128i sad16x1 = _mm_sad_epu8(x, y);
108 sad = _mm_add_epi32(sad, sad16x1);
109
110 a += a_stride;
111 b += b_stride;
112 }
113
114 const unsigned int res =
115 (unsigned int)(_mm_cvtsi128_si32(sad) +
116 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
117
118 return res;
119 }
120
sad32xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)121 static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride,
122 const uint8_t *b, int b_stride, int width,
123 int height) {
124 int i, j;
125 assert(width == 32);
126 (void)width;
127
128 __m128i sad = _mm_setzero_si128();
129 for (i = 0; i < height; ++i) {
130 for (j = 0; j < 2; ++j) {
131 __m128i x = xx_loadu_128(a + j * 16);
132 __m128i y = xx_loadu_128(b + j * 16);
133
134 __m128i sad32_half = _mm_sad_epu8(x, y);
135 sad = _mm_add_epi32(sad, sad32_half);
136 }
137
138 a += a_stride;
139 b += b_stride;
140 }
141
142 const unsigned int res =
143 (unsigned int)(_mm_cvtsi128_si32(sad) +
144 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
145
146 return res;
147 }
148
sad64xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)149 static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride,
150 const uint8_t *b, int b_stride, int width,
151 int height) {
152 int i, j;
153 assert(width == 64);
154 (void)width;
155
156 __m128i sad = _mm_setzero_si128();
157 for (i = 0; i < height; ++i) {
158 for (j = 0; j < 4; ++j) {
159 __m128i x = xx_loadu_128(a + j * 16);
160 __m128i y = xx_loadu_128(b + j * 16);
161
162 __m128i sad64_quarter = _mm_sad_epu8(x, y);
163 sad = _mm_add_epi32(sad, sad64_quarter);
164 }
165
166 a += a_stride;
167 b += b_stride;
168 }
169
170 const unsigned int res =
171 (unsigned int)(_mm_cvtsi128_si32(sad) +
172 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
173
174 return res;
175 }
176
sad128xh_sse2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)177 static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride,
178 const uint8_t *b, int b_stride, int width,
179 int height) {
180 int i, j;
181 assert(width == 128);
182 (void)width;
183
184 __m128i sad = _mm_setzero_si128();
185 for (i = 0; i < height; ++i) {
186 for (j = 0; j < 8; ++j) {
187 __m128i x = xx_loadu_128(a + j * 16);
188 __m128i y = xx_loadu_128(b + j * 16);
189
190 __m128i sad64_quarter = _mm_sad_epu8(x, y);
191 sad = _mm_add_epi32(sad, sad64_quarter);
192 }
193
194 a += a_stride;
195 b += b_stride;
196 }
197
198 const unsigned int res =
199 (unsigned int)(_mm_cvtsi128_si32(sad) +
200 _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
201
202 return res;
203 }
204
// Generates aom_dist_wtd_sad<m>x<n>_avg_sse2(): builds the distance-weighted
// compound prediction of `second_pred` and `ref` into a stack buffer via
// aom_dist_wtd_comp_avg_pred(), then returns the SAD between `src` and that
// prediction. The buffer is densely packed, so its stride equals the block
// width m — hence (comp_pred, m) is passed as the (b, b_stride) pair.
#define DIST_WTD_SADMXN_SSE2(m, n)                                            \
  unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint8_t comp_pred[m * n];                                                 \
    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
                               jcp_param);                                    \
    return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);              \
  }
214
// Instantiate the distance-weighted compound-average SAD entry points for
// each supported block size.
DIST_WTD_SADMXN_SSE2(128, 128)
DIST_WTD_SADMXN_SSE2(128, 64)
DIST_WTD_SADMXN_SSE2(64, 128)
DIST_WTD_SADMXN_SSE2(64, 64)
DIST_WTD_SADMXN_SSE2(64, 32)
DIST_WTD_SADMXN_SSE2(32, 64)
DIST_WTD_SADMXN_SSE2(32, 32)
DIST_WTD_SADMXN_SSE2(32, 16)
DIST_WTD_SADMXN_SSE2(16, 32)
DIST_WTD_SADMXN_SSE2(16, 16)
DIST_WTD_SADMXN_SSE2(16, 8)
DIST_WTD_SADMXN_SSE2(8, 16)
DIST_WTD_SADMXN_SSE2(8, 8)
DIST_WTD_SADMXN_SSE2(8, 4)
DIST_WTD_SADMXN_SSE2(4, 8)
DIST_WTD_SADMXN_SSE2(4, 4)
#if !CONFIG_REALTIME_ONLY
// 1:4 / 4:1 aspect-ratio partitions are excluded from realtime-only builds.
DIST_WTD_SADMXN_SSE2(4, 16)
DIST_WTD_SADMXN_SSE2(16, 4)
DIST_WTD_SADMXN_SSE2(8, 32)
DIST_WTD_SADMXN_SSE2(32, 8)
DIST_WTD_SADMXN_SSE2(16, 64)
DIST_WTD_SADMXN_SSE2(64, 16)
#endif
239