1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <emmintrin.h>
12
13 #include "config/aom_dsp_rtcd.h"
14 #include "aom_dsp/x86/convolve.h"
15
16 // -----------------------------------------------------------------------------
17
// Vertical 4-tap filter over a 4-sample-wide column of 16-bit pixels.
//
// Only entries 2..5 of the 8-entry `filter` array are applied. Output row r
// is built from source rows r+2 .. r+5 (counted from `src_ptr`), rounded with
// (+64) >> 7 and clamped to [0, (1 << bd) - 1].
//
// `src_pitch` and `dst_pitch` are measured in uint16_t elements. Rows are
// produced two at a time, so an odd trailing row is never written. Source
// rows 2, 3 and 4 are read even when `height` is below 2.
static void aom_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) to every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0101_2323 = _mm_unpacklo_epi32(taps, taps);
  const __m128i taps_4545_6767 = _mm_unpackhi_epi32(taps, taps);
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0101_2323, taps_0101_2323);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4545_6767, taps_4545_6767);

  // Each pass consumes two new source rows and emits two output rows.
  const ptrdiff_t src_step = src_pitch << 1;
  const ptrdiff_t dst_step = dst_pitch << 1;

  // Prime the sliding window with rows 2, 3 and 4. Interleaving two rows
  // sample-by-sample lets one madd form rowA * tapA + rowB * tapB per column.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  __m128i rows_23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows_34 = _mm_unpacklo_epi16(row3, row4);

  for (uint32_t rows_left = height; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i rows_45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows_56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit sums of the four taps for the two output rows.
    __m128i out0 = _mm_add_epi32(_mm_madd_epi16(rows_23, taps_23),
                                 _mm_madd_epi16(rows_45, taps_45));
    __m128i out1 = _mm_add_epi32(_mm_madd_epi16(rows_34, taps_23),
                                 _mm_madd_epi16(rows_56, taps_45));

    // Round and drop the 7 fractional bits.
    out0 = _mm_srai_epi32(_mm_add_epi32(out0, rounding), 7);
    out1 = _mm_srai_epi32(_mm_add_epi32(out1, rounding), 7);

    // Narrow to 16 bits (results land in the low 64 bits), then clamp.
    out0 = _mm_packs_epi32(out0, zero);
    out1 = _mm_packs_epi32(out1, zero);
    out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
    out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out0);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out1);

    src_ptr += src_step;
    dst_ptr += dst_step;

    // Slide the window down by two rows.
    rows_23 = rows_45;
    rows_34 = rows_56;
    row4 = row6;
  }
}
102
// Horizontal 4-tap filter producing 4 output samples per row of 16-bit pixels.
//
// Only entries 2..5 of the 8-entry `filter` array are applied. Output x is
//   filter[2]*src[x-1] + filter[3]*src[x] + filter[4]*src[x+1] +
//   filter[5]*src[x+2]
// rounded with (+64) >> 7 and clamped to [0, (1 << bd) - 1].
//
// Each row performs one 16-byte load starting at src_ptr[-1], so samples
// src_ptr[-1 .. 6] must be readable. Pitches are in uint16_t elements.
static void aom_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) to every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0101_2323 = _mm_unpacklo_epi32(taps, taps);
  const __m128i taps_4545_6767 = _mm_unpackhi_epi32(taps, taps);
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0101_2323, taps_0101_2323);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4545_6767, taps_4545_6767);

  // Tap 2 lines up with the sample one position left of the output.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t y = 0; y < height; ++y) {
    const __m128i px = _mm_loadu_si128((const __m128i *)row);

    // Lane x of pairs_23 holds (s[x-1], s[x]); lane x of pairs_45 holds
    // (s[x+1], s[x+2]).
    const __m128i pairs_23 = _mm_unpacklo_epi32(px, _mm_srli_si128(px, 2));
    const __m128i pairs_45 =
        _mm_unpacklo_epi32(_mm_srli_si128(px, 4), _mm_srli_si128(px, 6));

    __m128i out = _mm_add_epi32(_mm_madd_epi16(pairs_23, taps_23),
                                _mm_madd_epi16(pairs_45, taps_45));

    // Round and drop the 7 fractional bits.
    out = _mm_srai_epi32(_mm_add_epi32(out, rounding), 7);

    // Narrow to 16 bits, then clamp to the valid pixel range.
    out = _mm_packs_epi32(out, zero);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out);

    row += src_pitch;
    dst_ptr += dst_pitch;
  }
}
153
// Vertical 4-tap filter over an 8-sample-wide column of 16-bit pixels.
//
// Only entries 2..5 of the 8-entry `filter` array are applied. Output row r
// is built from source rows r+2 .. r+5 (counted from `src_ptr`), rounded with
// (+64) >> 7 and clamped to [0, (1 << bd) - 1].
//
// `src_pitch` and `dst_pitch` are measured in uint16_t elements. Rows are
// produced two at a time, so an odd trailing row is never written. Source
// rows 2, 3 and 4 are read even when `height` is below 2.
//
// Neither source nor destination needs to be 16-byte aligned: all 128-bit
// loads and stores use the unaligned forms.
static void aom_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) to every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0101_2323 = _mm_unpacklo_epi32(taps, taps);
  const __m128i taps_4545_6767 = _mm_unpackhi_epi32(taps, taps);
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0101_2323, taps_0101_2323);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4545_6767, taps_4545_6767);

  // Each pass consumes two new source rows and emits two output rows, so
  // both pointers advance by twice their pitch.
  const ptrdiff_t src_step = src_pitch << 1;
  const ptrdiff_t dst_step = dst_pitch << 1;

  // Prime the sliding window with rows 2, 3 and 4. Interleaving two rows
  // sample-by-sample lets one madd form rowA * tapA + rowB * tapB per column;
  // the lo/hi halves cover columns 0..3 and 4..7.
  const __m128i row2 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  __m128i rows_23_lo = _mm_unpacklo_epi16(row2, row3);
  __m128i rows_23_hi = _mm_unpackhi_epi16(row2, row3);
  __m128i rows_34_lo = _mm_unpacklo_epi16(row3, row4);
  __m128i rows_34_hi = _mm_unpackhi_epi16(row3, row4);

  for (uint32_t rows_left = height; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i rows_45_lo = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows_45_hi = _mm_unpackhi_epi16(row4, row5);
    const __m128i rows_56_lo = _mm_unpacklo_epi16(row5, row6);
    const __m128i rows_56_hi = _mm_unpackhi_epi16(row5, row6);

    // 32-bit sums of the four taps: columns 0..3 (lo) and 4..7 (hi) of the
    // first (out0) and second (out1) output row.
    __m128i out0_lo = _mm_add_epi32(_mm_madd_epi16(rows_23_lo, taps_23),
                                    _mm_madd_epi16(rows_45_lo, taps_45));
    __m128i out1_lo = _mm_add_epi32(_mm_madd_epi16(rows_34_lo, taps_23),
                                    _mm_madd_epi16(rows_56_lo, taps_45));
    __m128i out0_hi = _mm_add_epi32(_mm_madd_epi16(rows_23_hi, taps_23),
                                    _mm_madd_epi16(rows_45_hi, taps_45));
    __m128i out1_hi = _mm_add_epi32(_mm_madd_epi16(rows_34_hi, taps_23),
                                    _mm_madd_epi16(rows_56_hi, taps_45));

    // Round and drop the 7 fractional bits.
    out0_lo = _mm_srai_epi32(_mm_add_epi32(out0_lo, rounding), 7);
    out1_lo = _mm_srai_epi32(_mm_add_epi32(out1_lo, rounding), 7);
    out0_hi = _mm_srai_epi32(_mm_add_epi32(out0_hi, rounding), 7);
    out1_hi = _mm_srai_epi32(_mm_add_epi32(out1_hi, rounding), 7);

    // Narrow each row back to eight 16-bit samples, then clamp.
    __m128i out0 = _mm_packs_epi32(out0_lo, out0_hi);
    __m128i out1 = _mm_packs_epi32(out1_lo, out1_hi);
    out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
    out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

    // Unaligned stores: dst_ptr is only known to be uint16_t-aligned and
    // dst_pitch is arbitrary, so an aligned store could fault.
    _mm_storeu_si128((__m128i *)dst_ptr, out0);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), out1);

    src_ptr += src_step;
    dst_ptr += dst_step;

    // Slide the window down by two rows.
    rows_23_lo = rows_45_lo;
    rows_23_hi = rows_45_hi;
    rows_34_lo = rows_56_lo;
    rows_34_hi = rows_56_hi;
    row4 = row6;
  }
}
261
// Horizontal 4-tap filter producing 8 output samples per row of 16-bit pixels.
//
// Only entries 2..5 of the 8-entry `filter` array are applied. Output x is
//   filter[2]*src[x-1] + filter[3]*src[x] + filter[4]*src[x+1] +
//   filter[5]*src[x+2]
// rounded with (+64) >> 7 and clamped to [0, (1 << bd) - 1].
//
// Each row performs two 16-byte loads covering src_ptr[-1 .. 10], so those
// samples must be readable. Pitches are in uint16_t elements.
//
// Neither source nor destination needs to be 16-byte aligned: all 128-bit
// loads and stores use the unaligned forms.
static void aom_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) to every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0101_2323 = _mm_unpacklo_epi32(taps, taps);
  const __m128i taps_4545_6767 = _mm_unpackhi_epi32(taps, taps);
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0101_2323, taps_0101_2323);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4545_6767, taps_4545_6767);

  // Tap 2 lines up with the sample one position left of the output.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t y = 0; y < height; ++y) {
    // px_lo holds s[-1 .. 6], px_hi holds s[3 .. 10].
    const __m128i px_lo = _mm_loadu_si128((const __m128i *)row);
    const __m128i px_hi = _mm_loadu_si128((const __m128i *)(row + 4));

    // Even outputs (x = 0, 2, 4, 6): madd pairs start at s[-1] for taps 2/3
    // and at s[1] for taps 4/5.
    const __m128i even_45 =
        _mm_unpacklo_epi64(_mm_srli_si128(px_lo, 4), _mm_srli_si128(px_hi, 4));
    const __m128i even = _mm_add_epi32(_mm_madd_epi16(px_lo, taps_23),
                                       _mm_madd_epi16(even_45, taps_45));

    // Odd outputs (x = 1, 3, 5, 7): same pattern shifted by one sample.
    const __m128i odd_23 =
        _mm_unpacklo_epi64(_mm_srli_si128(px_lo, 2), _mm_srli_si128(px_hi, 2));
    const __m128i odd_45 =
        _mm_unpacklo_epi64(_mm_srli_si128(px_lo, 6), _mm_srli_si128(px_hi, 6));
    const __m128i odd = _mm_add_epi32(_mm_madd_epi16(odd_23, taps_23),
                                      _mm_madd_epi16(odd_45, taps_45));

    // Re-interleave even/odd sums into natural order: 0..3 and 4..7.
    __m128i out_lo = _mm_unpacklo_epi32(even, odd);
    __m128i out_hi = _mm_unpackhi_epi32(even, odd);

    // Round and drop the 7 fractional bits.
    out_lo = _mm_srai_epi32(_mm_add_epi32(out_lo, rounding), 7);
    out_hi = _mm_srai_epi32(_mm_add_epi32(out_hi, rounding), 7);

    // Narrow to 16 bits, then clamp to the valid pixel range.
    __m128i out = _mm_packs_epi32(out_lo, out_hi);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    // Unaligned store: dst_ptr is only known to be uint16_t-aligned and
    // dst_pitch is arbitrary, so an aligned store could fault.
    _mm_storeu_si128((__m128i *)dst_ptr, out);

    row += src_pitch;
    dst_ptr += dst_pitch;
  }
}
328
// Vertical 4-tap filter for a 16-sample-wide column, implemented as two
// passes of the 8-wide kernel: columns 0..7 first, then columns 8..15.
// All other arguments are forwarded unchanged.
static void aom_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch,
                                       dst_ptr + col, dst_pitch, height,
                                       filter, bd);
  }
}
337
// Horizontal 4-tap filter producing 16 output samples per row, implemented
// as two passes of the 8-wide kernel: columns 0..7 first, then columns 8..15.
// All other arguments are forwarded unchanged.
static void aom_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch,
                                       dst_ptr + col, dst_pitch, height,
                                       filter, bd);
  }
}
346
347 // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
348 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
349 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
350 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
351 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
352 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
353 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
354
355 // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
356 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
357 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
358 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
359 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
360 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
361 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
362
363 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
364 // ptrdiff_t src_stride,
365 // uint8_t *dst,
366 // ptrdiff_t dst_stride,
367 // const int16_t *filter_x,
368 // int x_step_q4,
369 // const int16_t *filter_y,
370 // int y_step_q4,
371 // int w, int h, int bd);
372 // void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
373 // ptrdiff_t src_stride,
374 // uint8_t *dst,
375 // ptrdiff_t dst_stride,
376 // const int16_t *filter_x,
377 // int x_step_q4,
378 // const int16_t *filter_y,
379 // int y_step_q4,
380 // int w, int h, int bd);
381 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
382 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
383