/*
 * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
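// Rounding is implemented as (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS,
// i.e. (sum + 64) >> 7: round-to-nearest for the 7 fractional bits of the
// filter sum in the high-bitdepth paths below.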

static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first, dst_second;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
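  // The kernel is halved (>> 1) above so the 16-bit intermediate sums stay in
  // range; rounding with 32 and shifting by 6 below, rather than 64 and 7,
  // restores the same overall scaling as the full-precision filter.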

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together for the first half of the even
    // output.
    // Repeat multiple times to get the whole output.
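    //
    // Conceptually, with the original (un-halved) kernel, each output is
    //   dst[x] = clamp((k[2]*s[x-1] + k[3]*s[x] + k[4]*s[x+1] + k[5]*s[x+2]
    //                   + 64) >> 7, 0, 255)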
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);

    // Do again to get the second half of dst
    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 14 12 10 8
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 15 13 11 9
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the second half of the dst
    dst_second = mm_zip_epi32_sse2(&even, &odd);

    // Round each result
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);

    // Finally combine to get the final dst
    dst_first = _mm_packus_epi16(dst_first, dst_second);
    _mm_store_si128((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* The macro used to generate functions shifts the src_ptr up by 3 rows already
 */

static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
      src_reg_m10_hi_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.
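  //
  // Conceptually, for each column x (with the original, un-halved kernel):
  //   dst[y, x] = clamp((k[2]*s[y-1, x] + k[3]*s[y, x] + k[4]*s[y+1, x]
  //                      + k[5]*s[y+2, x] + 64) >> 7, 0, 255)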

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
  src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
  src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);

    // Partial output from first half
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1,
                                             &src_reg_01_lo_2, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1,
                                             &src_reg_12_lo_2, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1,
                                             &src_reg_23_lo_2, &kernel_reg_45);

    // Add to get first half of the results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Now repeat everything again for the second half
    // Partial output for second half
    res_reg_m10_hi = mm_madd_packs_epi16_sse2(
        &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);

    res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1,
                                             &src_reg_01_hi_2, &kernel_reg_23);

    src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
    src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
    res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1,
                                             &src_reg_12_hi_2, &kernel_reg_45);

    src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
    src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
    res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1,
                                             &src_reg_23_hi_2, &kernel_reg_45);

    // Second half of the results
    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);

    // Combine to get the result
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);

    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_m10_hi_1 = src_reg_12_hi_1;
    src_reg_m10_hi_2 = src_reg_12_hi_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_01_hi_1 = src_reg_23_hi_1;
    src_reg_01_hi_2 = src_reg_23_hi_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1,
                                             &src_reg_01_lo_2, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1,
                                             &src_reg_12_lo_2, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1,
                                             &src_reg_23_lo_2, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());

    // Save only half of the register (8 words)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i tmp_0, tmp_1;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[1] s[0] s[0] s[-1]
    // ... s[3] s[2] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
    // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Convert to 16-bit words
    src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
    src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
    src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
    src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());

    // Shuffle into the right format
    tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
    tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);

    // Partial output
    tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
    tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);

    // Output
    dst_first = _mm_add_epi32(tmp_0, tmp_1);
    dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());

    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1;
  __m128i src_reg_01_lo_1;
  __m128i src_reg_12_lo_1;
  __m128i src_reg_23_lo_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo =
        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);

    res_reg_01_lo =
        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo =
        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo =
        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);

    // Save only one fourth of the register (4 words)
    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_1 = src_reg_3;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output
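  //
  // Unlike the 8-bit paths above, the high-bitdepth filters keep the kernel
  // at full precision, accumulate in 32 bits, and round with
  // (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS before clamping the
  // result to [0, (1 << bd) - 1].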

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.
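  //
  // Conceptually, for each column x (full-precision kernel):
  //   dst[y, x] = clamp((k[2]*s[y-1, x] + k[3]*s[y, x] + k[4]*s[y+1, x]
  //                      + k[5]*s[y+2, x] + 64) >> 7, 0, (1 << bd) - 1)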

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source
  __m128i src_reg_m10, src_reg_01;
  __m128i src_reg_12, src_reg_23;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
  __m128i res_reg_m1012, res_reg_0123;

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);

    // Add to get results
    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);

    // Round the words
    res_reg_m1012 =
        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123 =
        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);

    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save only half of the register (4 words)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10 = src_reg_12;
    src_reg_01 = src_reg_23;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together for the first half of the even
  // output.
  // Repeat multiple times to get the whole output.

  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
      src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;
  __m128i tmp_0, tmp_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will put first half in the first half of the reg, and second half in
    // second half
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
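
    // src_reg now holds s[-1..6] and src_reg_next holds s[4..11]; the 64-bit
    // unpacks below splice byte-shifted halves of the two loads into each
    // shifted window (e.g. s[1..8] for src_reg_shift_2) without shifting
    // across a 128-bit register boundary.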

    // Output 6 4 2 0
    tmp_0 = _mm_srli_si128(src_reg, 4);
    tmp_1 = _mm_srli_si128(src_reg_next, 2);
    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 7 5 3 1
    tmp_0 = _mm_srli_si128(src_reg, 2);
    tmp_1 = src_reg_next;
    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    tmp_0 = _mm_srli_si128(src_reg, 6);
    tmp_1 = _mm_srli_si128(src_reg_next, 4);
    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the dst
    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = mm_zip_epi32_sse2(&even, &odd);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_store_si128((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;
  __m128i res_reg_m1012_hi, res_reg_0123_hi;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);

    // Partial output for first half
    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo =
        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123_lo =
        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);

    // Partial output for second half
    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);

    // Add to get results
    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_hi =
        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123_hi =
        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine the two halves
    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save the full register (8 words)
    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo = src_reg_12_lo;
    src_reg_m10_hi = src_reg_12_hi;
    src_reg_01_lo = src_reg_23_lo;
    src_reg_01_hi = src_reg_23_hi;
    src_reg_1 = src_reg_3;
  }
}

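// The 16-wide high-bitdepth filters run the corresponding 8-wide kernel
// twice, once per half of the block; the second call re-reads the few
// overlapping source samples it needs, so the output matches a single
// 16-wide pass.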
static void vpx_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}

static void vpx_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64

// From vpx_subpixel_8t_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v8_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_sse2;
filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v2_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_sse2;
filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;

// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const InterpKernel *filter, int x0_q4,
//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const InterpKernel *filter, int x0_q4,
//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
//                              int w, int h);
// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const InterpKernel *filter, int x0_q4,
//                                   int32_t x_step_q4, int y0_q4,
//                                   int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const InterpKernel *filter, int x0_q4,
//                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                  int w, int h);
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
            sse2, 0)
FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)

// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const InterpKernel *filter, int x0_q4,
//                         int32_t x_step_q4, int y0_q4, int y_step_q4,
//                         int w, int h);
// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                             uint8_t *dst, ptrdiff_t dst_stride,
//                             const InterpKernel *filter, int x0_q4,
//                             int32_t x_step_q4, int y0_q4, int y_step_q4,
//                             int w, int h);
FUN_CONV_2D(, sse2, 0)
FUN_CONV_2D(avg_, sse2, 1)

#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
  vpx_highbd_filter_block1d16_v8_avg_sse2
#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
  vpx_highbd_filter_block1d16_h8_avg_sse2
#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
  vpx_highbd_filter_block1d8_v8_avg_sse2
#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
  vpx_highbd_filter_block1d8_h8_avg_sse2
#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;

// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
//                                      ptrdiff_t src_stride,
//                                      uint8_t *dst,
//                                      ptrdiff_t dst_stride,
//                                      const int16_t *filter_x,
//                                      int x_step_q4,
//                                      const int16_t *filter_y,
//                                      int y_step_q4,
//                                      int w, int h, int bd);
// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
//                                     ptrdiff_t src_stride,
//                                     uint8_t *dst,
//                                     ptrdiff_t dst_stride,
//                                     const int16_t *filter_x,
//                                     int x_step_q4,
//                                     const int16_t *filter_y,
//                                     int y_step_q4,
//                                     int w, int h, int bd);
// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
//                                          ptrdiff_t src_stride,
//                                          uint8_t *dst,
//                                          ptrdiff_t dst_stride,
//                                          const int16_t *filter_x,
//                                          int x_step_q4,
//                                          const int16_t *filter_y,
//                                          int y_step_q4,
//                                          int w, int h, int bd);
// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
//                                         ptrdiff_t src_stride,
//                                         uint8_t *dst,
//                                         ptrdiff_t dst_stride,
//                                         const int16_t *filter_x,
//                                         int x_step_q4,
//                                         const int16_t *filter_y,
//                                         int y_step_q4,
//                                         int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , sse2, 0)
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)

// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const InterpKernel *filter, int x0_q4,
//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                int w, int h, int bd);
// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const InterpKernel *filter, int x0_q4,
//                                    int32_t x_step_q4, int y0_q4,
//                                    int y_step_q4, int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2, 0)
HIGH_FUN_CONV_2D(avg_, sse2, 1)
#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64