/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))

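// Rounding sketch: with CONV8_ROUNDING_BITS = 7, the high-bitdepth paths
// below round a filtered sum to nearest with (sum + 64) >> 7. The 8-bit
// 4-tap paths first halve the kernel (_mm_srai_epi16(kernel, 1)) to keep
// the 16-bit intermediates in range, then compensate by rounding with
// (sum + 32) >> 6, i.e. reg_32 and a shift of 6.
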
static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first, dst_second;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together for the first half of even
    // output.
    // Repeat multiple times to get the whole output
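    // A scalar sketch of one even output, using the halved kernel k[] and
    // the 6-bit rounding from above (clamp8 denotes the unsigned saturation
    // performed by _mm_packus_epi16):
    //   dst[0] = clamp8((s[-1]*k[2] + s[0]*k[3] + s[1]*k[4] + s[2]*k[5]
    //                    + 32) >> 6);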
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);

    // Do again to get the second half of dst
    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 14 12 10 8
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 15 13 11 9
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the second half of the dst
    dst_second = mm_zip_epi32_sse2(&even, &odd);

    // Round each result
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);

    // Finally combine to get the final dst
    dst_first = _mm_packus_epi16(dst_first, dst_second);
    _mm_store_si128((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* The macro used to generate functions shifts the src_ptr up by 3 rows
 * already. */

static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
      src_reg_m10_hi_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as
  // 16-bit words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.
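  // Scalar sketch of one output pixel at row r, column c, with the halved
  // kernel (clamp8 is the unsigned saturation done by _mm_packus_epi16):
  //   dst[r][c] = clamp8((s[r-1][c]*k[2] + s[r][c]*k[3] + s[r+1][c]*k[4]
  //                       + s[r+2][c]*k[5] + 32) >> 6);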

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
  src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
  src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);

    // Partial output from first half
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
                                             &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
                                             &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
                                             &kernel_reg_45);

    // Add to get first half of the results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Now repeat everything again for the second half
    // Partial output for second half
    res_reg_m10_hi = mm_madd_packs_epi16_sse2(
        &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);

    res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2,
                                             &kernel_reg_23);

    src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
    src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
    res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2,
                                             &kernel_reg_45);

    src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
    src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
    res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2,
                                             &kernel_reg_45);

    // Second half of the results
    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);

    // Combine to get the result
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);

    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_m10_hi_1 = src_reg_12_hi_1;
    src_reg_m10_hi_2 = src_reg_12_hi_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_01_hi_1 = src_reg_23_hi_1;
    src_reg_01_hi_2 = src_reg_23_hi_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as
  // 16-bit words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
                                             &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
                                             &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
                                             &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());

    // Save only the low half of the register (8 pixels)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i tmp_0, tmp_1;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[1] s[0] s[0] s[-1]
    // ... s[3] s[2] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
    // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the output
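    // Lane view of the shuffle below (low 32-bit lane first), a sketch:
    //   tmp_0 = (s[-1],s[0]) (s[0],s[1]) (s[1],s[2]) (s[2],s[3])
    //   tmp_1 = (s[1],s[2])  (s[2],s[3]) (s[3],s[4]) (s[4],s[5])
    // so madd with (k[2],k[3]) and (k[4],k[5]) gives the partial sums for
    // outputs 0..3, which are then added pairwise.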
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Convert to 16-bit words
    src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
    src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
    src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
    src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());

    // Shuffle into the right format
    tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
    tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);

    // Partial output
    tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
    tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);

    // Output
    dst_first = _mm_add_epi32(tmp_0, tmp_1);
    dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());

    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1;
  __m128i src_reg_01_lo_1;
  __m128i src_reg_12_lo_1;
  __m128i src_reg_23_lo_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as
  // 16-bit words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo =
        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);

    res_reg_01_lo =
        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo =
        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo =
        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);

    // Save only 4 bytes of the register (the 4 output pixels)
    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_1 = src_reg_3;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output
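  // With 16-bit pixels each byte-shift below moves one pixel (2 bytes), the
  // kernel is used unhalved, rounding is (sum + CONV8_ROUNDING_NUM) >>
  // CONV8_ROUNDING_BITS, and the result is clamped to [0, (1 << bd) - 1].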

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words of
  // the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.
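  // With 16-bit input there is no zero-extension step: rows are interleaved
  // directly with _mm_unpacklo_epi16 and _mm_madd_epi16 accumulates into
  // 32-bit sums.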

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10, src_reg_01;
  __m128i src_reg_12, src_reg_23;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
  __m128i res_reg_m1012, res_reg_0123;

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);

    // Add to get results
    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);

    // Round the words
    res_reg_m1012 =
        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123 =
        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);

    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save only the low half of the register (4 words)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10 = src_reg_12;
    src_reg_01 = src_reg_23;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together for the first half of even
  // output.
  // Repeat multiple times to get the whole output

  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
      src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;
  __m128i tmp_0, tmp_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // Build each shifted vector with its first four pixels in the low half
    // of the register and the next four in the high half
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
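    // A sketch of the stitching: src_reg holds pixels s[-1]..s[6] and
    // src_reg_next holds s[4]..s[11]; the _mm_unpacklo_epi64 pairs below
    // assemble the shifted rows s[0]..s[7], s[1]..s[8] and s[2]..s[9].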

    // Output 6 4 2 0
    tmp_0 = _mm_srli_si128(src_reg, 4);
    tmp_1 = _mm_srli_si128(src_reg_next, 2);
    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 7 5 3 1
    tmp_0 = _mm_srli_si128(src_reg, 2);
    tmp_1 = src_reg_next;
    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    tmp_0 = _mm_srli_si128(src_reg, 6);
    tmp_1 = _mm_srli_si128(src_reg_next, 4);
    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = mm_zip_epi32_sse2(&even, &odd);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_store_si128((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words of
  // the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;
  __m128i res_reg_m1012_hi, res_reg_0123_hi;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);

    // Partial output for first half
    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo =
        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123_lo =
        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);

    // Partial output for second half
    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);

    // Add to get results
    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_hi =
        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123_hi =
        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine the two halves
    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save the whole register (8 words per row)
    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo = src_reg_12_lo;
    src_reg_m10_hi = src_reg_12_hi;
    src_reg_01_lo = src_reg_23_lo;
    src_reg_01_hi = src_reg_23_hi;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}

static void vpx_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64

// From vpx_subpixel_8t_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v8_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_sse2;
filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v2_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_sse2;
filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;

// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const InterpKernel *filter, int x0_q4,
//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const InterpKernel *filter, int x0_q4,
//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
//                              int w, int h);
// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const InterpKernel *filter, int x0_q4,
//                                   int32_t x_step_q4, int y0_q4,
//                                   int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const InterpKernel *filter, int x0_q4,
//                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                  int w, int h);
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
            sse2, 0)
FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)

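// Example call, a sketch: the macro-generated entry points above select the
// 2-, 4-, or 8-tap 1-D path based on the chosen kernel. Filtering a 64x64
// block horizontally with phase x0_q4 and a unit step (16 in q4) would be:
//   vpx_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter,
//                            x0_q4, 16, 0, 16, 64, 64);
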
// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const InterpKernel *filter, int x0_q4,
//                         int32_t x_step_q4, int y0_q4, int y_step_q4,
//                         int w, int h);
// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                             uint8_t *dst, ptrdiff_t dst_stride,
//                             const InterpKernel *filter, int x0_q4,
//                             int32_t x_step_q4, int y0_q4, int y_step_q4,
//                             int w, int h);
FUN_CONV_2D(, sse2, 0)
FUN_CONV_2D(avg_, sse2, 1)
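// A sketch of the 2-D wrappers' behavior: they are separable, running a
// horizontal pass into an intermediate buffer tall enough for the vertical
// taps and then a vertical pass into dst, reusing the 1-D filters above.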

#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
  vpx_highbd_filter_block1d16_v8_avg_sse2
#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
  vpx_highbd_filter_block1d16_h8_avg_sse2
#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
  vpx_highbd_filter_block1d8_v8_avg_sse2
#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
  vpx_highbd_filter_block1d8_h8_avg_sse2
#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;

// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
//                                      ptrdiff_t src_stride,
//                                      uint8_t *dst,
//                                      ptrdiff_t dst_stride,
//                                      const int16_t *filter_x,
//                                      int x_step_q4,
//                                      const int16_t *filter_y,
//                                      int y_step_q4,
//                                      int w, int h, int bd);
// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
//                                     ptrdiff_t src_stride,
//                                     uint8_t *dst,
//                                     ptrdiff_t dst_stride,
//                                     const int16_t *filter_x,
//                                     int x_step_q4,
//                                     const int16_t *filter_y,
//                                     int y_step_q4,
//                                     int w, int h, int bd);
// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
//                                          ptrdiff_t src_stride,
//                                          uint8_t *dst,
//                                          ptrdiff_t dst_stride,
//                                          const int16_t *filter_x,
//                                          int x_step_q4,
//                                          const int16_t *filter_y,
//                                          int y_step_q4,
//                                          int w, int h, int bd);
// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
//                                         ptrdiff_t src_stride,
//                                         uint8_t *dst,
//                                         ptrdiff_t dst_stride,
//                                         const int16_t *filter_x,
//                                         int x_step_q4,
//                                         const int16_t *filter_y,
//                                         int y_step_q4,
//                                         int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , sse2, 0)
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)

// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const InterpKernel *filter, int x0_q4,
//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                int w, int h, int bd);
// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const InterpKernel *filter, int x0_q4,
//                                    int32_t x_step_q4, int y0_q4,
//                                    int y_step_q4, int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2, 0)
HIGH_FUN_CONV_2D(avg_, sse2, 1)
#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64