/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////
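// In scalar form, the SAD accumulated by the kernels below is roughly
// (illustrative sketch only, not part of the build):
//
//   for (int r = 0; r < height; ++r)
//     for (int c = 0; c < width; ++c)
//       sad += (abs(wsrc[r * width + c] -
//                   mask[r * width + c] * pre[r * pre_stride + c]) +
//               (1 << 11)) >> 12;
//
// The bias (1 << 12) >> 1 == 1 << 11 implements round-to-nearest before the
// 12-bit downshift.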

static inline unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int height) {
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);

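  // Each iteration processes two rows of four pixels: the 8-bit samples from
  // both rows are packed into one 128-bit register, widened to 32-bit lanes,
  // and matched against eight 32-bit wsrc/mask entries, so n advances by 8
  // and pre by two rows.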
  do {
    const __m128i v_p_b_0 = xx_loadl_32(pre);
    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);

    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);

    n += 8;
    pre += pre_stride << 1;
  } while (n < 8 * (height >> 1));

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

static inline unsigned int obmc_sad_w8n_avx2(
    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const int pre_step = pre_stride - width;
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

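  // Each iteration handles eight pixels of a row: one 64-bit load widened to
  // 32-bit lanes against eight wsrc/mask entries. Once a full row of `width`
  // pixels has been consumed, pre is bumped by pre_step to reach the next row.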
  do {
    const __m128i v_p0_b = xx_loadl_64(pre + n);
    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);

    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);

    n += 8;

    if ((n & (width - 1)) == 0) pre += pre_step;
  } while (n < width * height);

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

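// OBMCSADWXH generates the per-block-size AVX2 entry points used by the RTCD
// dispatch: 4-wide blocks use the dedicated w4 kernel, all wider block sizes
// go through the generic w8n kernel.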
#define OBMCSADWXH(w, h)                                          \
  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *msk) {                                       \
    if (w == 4) {                                                 \
      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
    } else {                                                      \
      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
    }                                                             \
  }

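// For example, OBMCSADWXH(16, 16) expands to aom_obmc_sad16x16_avx2(), which
// is picked up by the runtime CPU dispatch.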
OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
OBMCSADWXH(16, 64)
OBMCSADWXH(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

#if CONFIG_AV1_HIGHBITDEPTH
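// The high bit-depth kernels mirror the 8-bit ones above; the difference is
// that pre8 points at 16-bit samples (recovered via CONVERT_TO_SHORTPTR), so
// the loads and the widening step work on 16-bit lanes instead of 8-bit ones.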
static inline unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
                                                const int pre_stride,
                                                const int32_t *wsrc,
                                                const int32_t *mask,
                                                const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
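  // As in the 8-bit path, each iteration covers two rows of four pixels, but
  // each row is fetched with a 64-bit load of 16-bit samples before being
  // widened to 32 bits.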
  do {
    const __m128i v_p_w_0 = xx_loadl_64(pre);
    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);

    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);

    n += 8;

    pre += pre_stride << 1;
  } while (n < 8 * (height >> 1));

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

static inline unsigned int hbd_obmc_sad_w8n_avx2(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

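  // Same row-wise walk as obmc_sad_w8n_avx2(), except that the eight pixels
  // per iteration are loaded as a full 128-bit vector of 16-bit samples.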
  do {
    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);

    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

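// HBD_OBMCSADWXH mirrors OBMCSADWXH above, generating the
// aom_highbd_obmc_sad*_avx2 entry points.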
#define HBD_OBMCSADWXH(w, h)                                           \
  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask) {                                           \
    if (w == 4) {                                                      \
      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                           \
      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
    }                                                                  \
  }

HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
HBD_OBMCSADWXH(16, 64)
HBD_OBMCSADWXH(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH