1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <immintrin.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17
18 #include "aom_ports/mem.h"
19 #include "aom/aom_integer.h"
20
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
23 #include "aom_dsp/x86/synonyms.h"
24
25 ////////////////////////////////////////////////////////////////////////////////
26 // 8 bit
27 ////////////////////////////////////////////////////////////////////////////////
28
obmc_sad_w4(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)29 static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
30 const int pre_stride,
31 const int32_t *wsrc,
32 const int32_t *mask,
33 const int height) {
34 const int pre_step = pre_stride - 4;
35 int n = 0;
36 __m128i v_sad_d = _mm_setzero_si128();
37
38 do {
39 const __m128i v_p_b = xx_loadl_32(pre + n);
40 const __m128i v_m_d = xx_load_128(mask + n);
41 const __m128i v_w_d = xx_load_128(wsrc + n);
42
43 const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
44
45 // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
46 // boundaries. We use pmaddwd, as it has lower latency on Haswell
47 // than pmulld but produces the same result with these inputs.
48 const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
49
50 const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
51 const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
52
53 // Rounded absolute difference
54 const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
55
56 v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
57
58 n += 4;
59
60 if (n % 4 == 0) pre += pre_step;
61 } while (n < 4 * height);
62
63 return xx_hsum_epi32_si32(v_sad_d);
64 }
65
obmc_sad_w8n(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)66 static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
67 const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
68 const int32_t *mask, const int width, const int height) {
69 const int pre_step = pre_stride - width;
70 int n = 0;
71 __m128i v_sad_d = _mm_setzero_si128();
72
73 assert(width >= 8);
74 assert(IS_POWER_OF_TWO(width));
75
76 do {
77 const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
78 const __m128i v_m1_d = xx_load_128(mask + n + 4);
79 const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
80 const __m128i v_p0_b = xx_loadl_32(pre + n);
81 const __m128i v_m0_d = xx_load_128(mask + n);
82 const __m128i v_w0_d = xx_load_128(wsrc + n);
83
84 const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
85 const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
86
87 // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
88 // boundaries. We use pmaddwd, as it has lower latency on Haswell
89 // than pmulld but produces the same result with these inputs.
90 const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
91 const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
92
93 const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
94 const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
95 const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
96 const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
97
98 // Rounded absolute difference
99 const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
100 const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
101
102 v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
103 v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
104
105 n += 8;
106
107 if (n % width == 0) pre += pre_step;
108 } while (n < width * height);
109
110 return xx_hsum_epi32_si32(v_sad_d);
111 }
112
// Defines the public entry point aom_obmc_sad<w>x<h>_sse4_1. A width of 4 is
// routed to obmc_sad_w4; every other width is routed to obmc_sad_w8n. Each
// expansion in this file passes literal dimensions, so the selection is a
// constant expression.
#define OBMCSADWXH(w, h)                                                 \
  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                           \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,           \
      const int32_t *msk) {                                              \
    return (w) == 4                                                      \
               ? obmc_sad_w4(pre, pre_stride, wsrc, msk, (h))            \
               : obmc_sad_w8n(pre, pre_stride, wsrc, msk, (w), (h));     \
  }
123
124 OBMCSADWXH(128, 128)
125 OBMCSADWXH(128, 64)
126 OBMCSADWXH(64, 128)
127 OBMCSADWXH(64, 64)
128 OBMCSADWXH(64, 32)
129 OBMCSADWXH(32, 64)
130 OBMCSADWXH(32, 32)
131 OBMCSADWXH(32, 16)
132 OBMCSADWXH(16, 32)
133 OBMCSADWXH(16, 16)
134 OBMCSADWXH(16, 8)
135 OBMCSADWXH(8, 16)
136 OBMCSADWXH(8, 8)
137 OBMCSADWXH(8, 4)
138 OBMCSADWXH(4, 8)
139 OBMCSADWXH(4, 4)
140 OBMCSADWXH(4, 16)
141 OBMCSADWXH(16, 4)
142 OBMCSADWXH(8, 32)
143 OBMCSADWXH(32, 8)
144 OBMCSADWXH(16, 64)
145 OBMCSADWXH(64, 16)
146
147 ////////////////////////////////////////////////////////////////////////////////
148 // High bit-depth
149 ////////////////////////////////////////////////////////////////////////////////
150
151 #if CONFIG_AV1_HIGHBITDEPTH
hbd_obmc_sad_w4(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)152 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
153 const int pre_stride,
154 const int32_t *wsrc,
155 const int32_t *mask,
156 const int height) {
157 const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
158 const int pre_step = pre_stride - 4;
159 int n = 0;
160 __m128i v_sad_d = _mm_setzero_si128();
161
162 do {
163 const __m128i v_p_w = xx_loadl_64(pre + n);
164 const __m128i v_m_d = xx_load_128(mask + n);
165 const __m128i v_w_d = xx_load_128(wsrc + n);
166
167 const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
168
169 // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
170 // boundaries. We use pmaddwd, as it has lower latency on Haswell
171 // than pmulld but produces the same result with these inputs.
172 const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
173
174 const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
175 const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
176
177 // Rounded absolute difference
178 const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
179
180 v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
181
182 n += 4;
183
184 if (n % 4 == 0) pre += pre_step;
185 } while (n < 4 * height);
186
187 return xx_hsum_epi32_si32(v_sad_d);
188 }
189
hbd_obmc_sad_w8n(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)190 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
191 const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
192 const int32_t *mask, const int width, const int height) {
193 const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
194 const int pre_step = pre_stride - width;
195 int n = 0;
196 __m128i v_sad_d = _mm_setzero_si128();
197
198 assert(width >= 8);
199 assert(IS_POWER_OF_TWO(width));
200
201 do {
202 const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
203 const __m128i v_m1_d = xx_load_128(mask + n + 4);
204 const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
205 const __m128i v_p0_w = xx_loadl_64(pre + n);
206 const __m128i v_m0_d = xx_load_128(mask + n);
207 const __m128i v_w0_d = xx_load_128(wsrc + n);
208
209 const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
210 const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
211
212 // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
213 // boundaries. We use pmaddwd, as it has lower latency on Haswell
214 // than pmulld but produces the same result with these inputs.
215 const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
216 const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
217
218 const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
219 const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
220 const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
221 const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
222
223 // Rounded absolute difference
224 const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
225 const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
226
227 v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
228 v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
229
230 n += 8;
231
232 if (n % width == 0) pre += pre_step;
233 } while (n < width * height);
234
235 return xx_hsum_epi32_si32(v_sad_d);
236 }
237
// Defines the public entry point aom_highbd_obmc_sad<w>x<h>_sse4_1. A width
// of 4 is routed to hbd_obmc_sad_w4; every other width is routed to
// hbd_obmc_sad_w8n. Each expansion in this file passes literal dimensions, so
// the selection is a constant expression.
#define HBD_OBMCSADWXH(w, h)                                                 \
  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(                        \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask) {                                                 \
    return (w) == 4                                                          \
               ? hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, (h))           \
               : hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, (w), (h));    \
  }
248
249 HBD_OBMCSADWXH(128, 128)
250 HBD_OBMCSADWXH(128, 64)
251 HBD_OBMCSADWXH(64, 128)
252 HBD_OBMCSADWXH(64, 64)
253 HBD_OBMCSADWXH(64, 32)
254 HBD_OBMCSADWXH(32, 64)
255 HBD_OBMCSADWXH(32, 32)
256 HBD_OBMCSADWXH(32, 16)
257 HBD_OBMCSADWXH(16, 32)
258 HBD_OBMCSADWXH(16, 16)
259 HBD_OBMCSADWXH(16, 8)
260 HBD_OBMCSADWXH(8, 16)
261 HBD_OBMCSADWXH(8, 8)
262 HBD_OBMCSADWXH(8, 4)
263 HBD_OBMCSADWXH(4, 8)
264 HBD_OBMCSADWXH(4, 4)
265 HBD_OBMCSADWXH(4, 16)
266 HBD_OBMCSADWXH(16, 4)
267 HBD_OBMCSADWXH(8, 32)
268 HBD_OBMCSADWXH(32, 8)
269 HBD_OBMCSADWXH(16, 64)
270 HBD_OBMCSADWXH(64, 16)
271 #endif // CONFIG_AV1_HIGHBITDEPTH
272