/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_ssse3.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"

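// Shuffle masks for _mm_shuffle_epi8 used by the horizontal filters below.
// Each 16-byte mask gathers, for eight consecutive output pixels, the pair of
// adjacent source bytes that one pair of filter taps multiplies: bytes 0..8
// for taps 0/1, 2..10 for taps 2/3, 4..12 for taps 4/5, and 6..14 for taps
// 6/7. Every mask is stored twice, which (we assume) mirrors the 32-byte row
// layout of the matching AVX2 table.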
DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
  0, 1, 1, 2, 2, 3, 3, 4,  4, 5,  5, 6,  6, 7,  7, 8,
  0, 1, 1, 2, 2, 3, 3, 4,  4, 5,  5, 6,  6, 7,  7, 8,
  2, 3, 3, 4, 4, 5, 5, 6,  6, 7,  7, 8,  8, 9,  9, 10,
  2, 3, 3, 4, 4, 5, 5, 6,  6, 7,  7, 8,  8, 9,  9, 10,
  4, 5, 5, 6, 6, 7, 7, 8,  8, 9,  9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8,  8, 9,  9, 10, 10, 11, 11, 12,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

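// Shuffle mask for the 4-wide horizontal filter: for each of four output
// pixels it gathers the four adjacent source bytes (at offsets 2..5, 3..6,
// 4..7 and 5..8) that the middle four taps multiply via _mm_maddubs_epi16.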
DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};

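// For reference, a scalar model of what the *_h4 kernels below compute. This
// sketch is ours, not part of libaom; it assumes the 4-tap path is taken only
// when taps 0, 1, 6 and 7 of the 8-tap kernel are zero, and that clamp()
// saturates to [0, 255]:
//
//   for (int x = 0; x < w; ++x) {
//     int sum = 0;
//     for (int k = 2; k <= 5; ++k)
//       sum += (filter[k] >> 1) * src[x + k - 3];  // taps halved to fit int8
//     dst[x] = clamp((sum + 32) >> 6, 0, 255);  // 32/6 instead of 64/7
//   }
//
// Taps can be as large as 128, which does not fit in the signed 8-bit
// operand of _mm_maddubs_epi16, so they are halved up front and the rounding
// constant (32) and shift (6) are halved to match.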
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) taps to 8 bit (byte) and replicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate the middle four taps (bytes 2..5) across the register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load a row of source pixels
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // gather the source bytes for each output pixel
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements by the filter taps and add adjacent pairs
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // round and shift each 16-bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}

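// The vertical 4-tap kernels below interleave rows rather than columns: rows
// n and n + 1 are interleaved byte-wise so that _mm_maddubs_epi16 multiplies
// vertically adjacent pixels by a pair of taps in one step, and two output
// rows are produced per loop iteration.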
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps, then convert the 16-bit (short) values to 8 bit (byte)
  // and replicate the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate the middle four taps (bytes 2..5) across the register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements by the filter taps and add the results
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // round and shift each 16-bit value right by 6 bits
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation; resReglo
    // holds the first output row and resReghi the second
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // carry the already-interleaved rows over to the next iteration
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}

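// The 8- and 16-wide horizontal kernels split the middle four taps into two
// pairs (taps 2/3 and 4/5). For each pair, a filt_h4 shuffle mask gathers the
// corresponding adjacent source bytes and _mm_maddubs_epi16 forms the partial
// sums, which are then added together.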
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) taps to 8 bit (byte) and replicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements by the filter taps and add the results
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round and shift each 16-bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

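// 8-wide vertical variant: rows are 8 bytes wide, so pairs of rows can be
// byte-interleaved directly (no dword pre-interleave as in the 4-wide case)
// and fed to _mm_maddubs_epi16 against the tap pairs.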
static void aom_filter_block1d8_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
  __m128i resReg23, resReg34, resReg45, resReg56;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps, then convert the 16-bit (short) values to 8 bit (byte)
  // and replicate the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements by the filter taps and add the results
    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);

    // add and saturate the results together
    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);

    // round and shift each 16-bit value right by 6 bits
    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation;
    // resReg23_45 holds the first output row and resReg34_56 the second
    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());

    src_ptr += src_stride;

    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // carry the already-interleaved rows over to the next iteration
    srcReg23 = srcReg45;
    srcReg34 = srcReg56;
    srcReg4 = srcReg6;
  }
}

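// 16-wide horizontal variant: the same tap-pair scheme as the 8-wide kernel,
// applied once to bytes 0..15 and once to bytes 8..23 of the row, with the
// two 8-pixel results packed into a single 16-byte store.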
static void aom_filter_block1d16_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) taps to 8 bit (byte) and replicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements by the filter taps and add the results
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // load the next 16 bytes of the row
    // (part of them was already covered by the earlier load)
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements by the filter taps and add the results
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round and shift each 16-bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation; the low
    // half holds the first eight pixels and the high half the second eight
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_pixels_per_line;

    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d16_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps, then convert the 16-bit (short) values to 8 bit (byte)
  // and replicate the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements by the filter taps and add the results
    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements by the filter taps and add the results
    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // round and shift each 16-bit value right by 6 bits
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink each 16-bit value to 8 bits with unsigned saturation;
    // resReg23_45 holds the first output row and resReg34_56 the second
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // carry the already-interleaved rows over to the next iteration
    srcReg23_lo = srcReg45_lo;
    srcReg34_lo = srcReg56_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}

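// Helper for the scaling path below: shuffle_filter_ssse3() expands the 8-tap
// kernel into four registers of duplicated tap pairs and convolve8_8_ssse3()
// applies them to eight interleaved pixel pairs (both come from the included
// convolve_ssse3.h).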
static inline __m128i shuffle_filter_convolve8_8_ssse3(
    const __m128i *const s, const int16_t *const filter) {
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);
  return convolve8_8_ssse3(s, f);
}

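// Horizontal filtering for the scaling path works transposed: eight source
// rows are loaded and transposed so that the taps of one output pixel line up
// as register pairs, letting the vertical convolve core do the work.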
static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const x_filter) {
  __m128i s[8], ss[4], temp;

  load_8bit_8x8(src, src_stride, s);
  // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
  // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
  // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
  // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
  transpose_16bit_4x8(s, ss);
  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
  // shrink each 16-bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void transpose8x8_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[8];

  load_8bit_8x8(src, src_stride, s);
  transpose_8bit_8x8(s, s);
  store_8bit_8x8(s, dst, dst_stride);
}

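// x_q4 walks the source in 1/16-pel units: x_q4 >> SUBPEL_BITS is the integer
// source column and x_q4 & SUBPEL_MASK selects the sub-pel kernel. A phase of
// zero needs no filtering, so the pixel under the filter's center tap (offset
// 3, since src was rewound by SUBPEL_TAPS / 2 - 1) is copied directly.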
static void scaledconvolve_horiz_w8(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static void filter_horiz_w4_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter) {
  __m128i s[4];
  __m128i temp;

  load_8bit_8x4(src, src_stride, s);
  transpose_16bit_4x4(s, s);

  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
  // shrink each 16-bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[4];

  load_8bit_4x4(src, src_stride, s);
  // the transposed 4x4 block fits in one register; extract rows 1..3 by
  // shifting 4, 8 and 12 bytes down
  s[0] = transpose_8bit_4x4(s);
  s[1] = _mm_srli_si128(s[0], 4);
  s[2] = _mm_srli_si128(s[0], 8);
  s[3] = _mm_srli_si128(s[0], 12);
  store_8bit_4x4(s, dst, dst_stride);
}

static void scaledconvolve_horiz_w4(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

static __m128i filter_vert_kernel(const __m128i *const s,
                                  const int16_t *const filter) {
  __m128i ss[4];
  __m128i temp;

  // 00 10 01 11 02 12 03 13
  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
  // 20 30 21 31 22 32 23 33
  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
  // 40 50 41 51 42 52 43 53
  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
  // 60 70 61 71 62 72 63 73
  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);

  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
  // shrink each 16-bit value to 8 bits
  return _mm_packus_epi16(temp, temp);
}

static void filter_vert_w4_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8];
  __m128i temp;

  load_8bit_4x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

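// The vertical analogue of the horizontal walk above: y_q4 advances in
// 1/16-pel units, and a zero sub-pel phase copies the row under the filter's
// center tap (3 rows down, since src was rewound by SUBPEL_TAPS / 2 - 1
// rows).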
static void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8], temp;

  load_8bit_8x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter, const int w) {
  int i;
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);

  for (i = 0; i < w; i += 16) {
    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;

    loadu_8bit_16x8(src, src_stride, s);

    // interleave the rows pairwise
    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
    temp_lo = convolve8_8_ssse3(s_lo, f);
    temp_hi = convolve8_8_ssse3(s_hi, f);

    // shrink each 16-bit value to 8 bits; the low half holds the result for
    // the first eight pixels and the high half for the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src += 16;
    // save the 16-byte convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

static void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

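// Illustrative call (values ours, not from this file): downscaling a region
// by 1/2 in both directions to a 16x16 output would use step_q4 = 32, i.e.
// two source pixels per output pixel in 1/16-pel units:
//
//   aom_scaled_2d_ssse3(src, src_stride, dst, dst_stride, kernels,
//                       /*x0_q4=*/0, /*x_step_q4=*/32,
//                       /*y0_q4=*/0, /*y_step_q4=*/32, /*w=*/16, /*h=*/16);
//
// where `kernels` is any InterpKernel table with 16 sub-pel phases.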
void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
  // is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}

filter8_1dfunction aom_filter_block1d16_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_h8_ssse3;
filter8_1dfunction aom_filter_block1d8_v8_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;

filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;

// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
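// FUN_CONV_1D (from convolve.h) expands to the two wrappers prototyped above.
// As we read the macro, it dispatches on the kernel shape: the 8-tap
// aom_filter_block1d*_{h,v}8 kernels when any outer tap is nonzero, the
// static 4-tap *_{h,v}4 kernels above when only the middle taps are used,
// and the 2-tap *_{h,v}2 kernels for bilinear filters.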
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
848