xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/convolve_ssse3.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
15*fb1b10abSAndroid Build Coastguard Worker #include <tmmintrin.h>  // SSSE3
16*fb1b10abSAndroid Build Coastguard Worker 
17*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
18*fb1b10abSAndroid Build Coastguard Worker 
shuffle_filter_ssse3(const int16_t * const filter,__m128i * const f)19*fb1b10abSAndroid Build Coastguard Worker static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
20*fb1b10abSAndroid Build Coastguard Worker                                         __m128i *const f) {
21*fb1b10abSAndroid Build Coastguard Worker   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
22*fb1b10abSAndroid Build Coastguard Worker   // pack and duplicate the filter values
23*fb1b10abSAndroid Build Coastguard Worker   f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
24*fb1b10abSAndroid Build Coastguard Worker   f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
25*fb1b10abSAndroid Build Coastguard Worker   f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
26*fb1b10abSAndroid Build Coastguard Worker   f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
27*fb1b10abSAndroid Build Coastguard Worker }
28*fb1b10abSAndroid Build Coastguard Worker 
shuffle_filter_odd_ssse3(const int16_t * const filter,__m128i * const f)29*fb1b10abSAndroid Build Coastguard Worker static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
30*fb1b10abSAndroid Build Coastguard Worker                                             __m128i *const f) {
31*fb1b10abSAndroid Build Coastguard Worker   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
32*fb1b10abSAndroid Build Coastguard Worker   // pack and duplicate the filter values
33*fb1b10abSAndroid Build Coastguard Worker   // It utilizes the fact that the high byte of filter[3] is always 0 to clean
34*fb1b10abSAndroid Build Coastguard Worker   // half of f[0] and f[4].
35*fb1b10abSAndroid Build Coastguard Worker   assert(filter[3] >= 0 && filter[3] < 256);
36*fb1b10abSAndroid Build Coastguard Worker   f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
37*fb1b10abSAndroid Build Coastguard Worker   f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
38*fb1b10abSAndroid Build Coastguard Worker   f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
39*fb1b10abSAndroid Build Coastguard Worker   f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
40*fb1b10abSAndroid Build Coastguard Worker   f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
41*fb1b10abSAndroid Build Coastguard Worker }
42*fb1b10abSAndroid Build Coastguard Worker 
convolve8_8_ssse3(const __m128i * const s,const __m128i * const f)43*fb1b10abSAndroid Build Coastguard Worker static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
44*fb1b10abSAndroid Build Coastguard Worker                                         const __m128i *const f) {
45*fb1b10abSAndroid Build Coastguard Worker   // multiply 2 adjacent elements with the filter and add the result
46*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_64 = _mm_set1_epi16(1 << 6);
47*fb1b10abSAndroid Build Coastguard Worker   const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
48*fb1b10abSAndroid Build Coastguard Worker   const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
49*fb1b10abSAndroid Build Coastguard Worker   const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
50*fb1b10abSAndroid Build Coastguard Worker   const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
51*fb1b10abSAndroid Build Coastguard Worker   __m128i sum1, sum2;
52*fb1b10abSAndroid Build Coastguard Worker 
53*fb1b10abSAndroid Build Coastguard Worker   // sum the results together, saturating only on the final step
54*fb1b10abSAndroid Build Coastguard Worker   // adding x0 with x2 and x1 with x3 is the only order that prevents
55*fb1b10abSAndroid Build Coastguard Worker   // outranges for all filters
56*fb1b10abSAndroid Build Coastguard Worker   sum1 = _mm_add_epi16(x0, x2);
57*fb1b10abSAndroid Build Coastguard Worker   sum2 = _mm_add_epi16(x1, x3);
58*fb1b10abSAndroid Build Coastguard Worker   // add the rounding offset early to avoid another saturated add
59*fb1b10abSAndroid Build Coastguard Worker   sum1 = _mm_add_epi16(sum1, k_64);
60*fb1b10abSAndroid Build Coastguard Worker   sum1 = _mm_adds_epi16(sum1, sum2);
61*fb1b10abSAndroid Build Coastguard Worker   // shift by 7 bit each 16 bit
62*fb1b10abSAndroid Build Coastguard Worker   sum1 = _mm_srai_epi16(sum1, 7);
63*fb1b10abSAndroid Build Coastguard Worker   return sum1;
64*fb1b10abSAndroid Build Coastguard Worker }
65*fb1b10abSAndroid Build Coastguard Worker 
convolve8_8_even_offset_ssse3(const __m128i * const s,const __m128i * const f)66*fb1b10abSAndroid Build Coastguard Worker static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
67*fb1b10abSAndroid Build Coastguard Worker                                                     const __m128i *const f) {
68*fb1b10abSAndroid Build Coastguard Worker   // multiply 2 adjacent elements with the filter and add the result
69*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_64 = _mm_set1_epi16(1 << 6);
70*fb1b10abSAndroid Build Coastguard Worker   const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
71*fb1b10abSAndroid Build Coastguard Worker   const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
72*fb1b10abSAndroid Build Coastguard Worker   const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
73*fb1b10abSAndroid Build Coastguard Worker   const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
74*fb1b10abSAndroid Build Coastguard Worker   // compensate the subtracted 64 in f[1]. x4 is always non negative.
75*fb1b10abSAndroid Build Coastguard Worker   const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
76*fb1b10abSAndroid Build Coastguard Worker   // add and saturate the results together
77*fb1b10abSAndroid Build Coastguard Worker   __m128i temp = _mm_adds_epi16(x0, x3);
78*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x1);
79*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x2);
80*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x4);
81*fb1b10abSAndroid Build Coastguard Worker   // round and shift by 7 bit each 16 bit
82*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, k_64);
83*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_srai_epi16(temp, 7);
84*fb1b10abSAndroid Build Coastguard Worker   return temp;
85*fb1b10abSAndroid Build Coastguard Worker }
86*fb1b10abSAndroid Build Coastguard Worker 
convolve8_8_odd_offset_ssse3(const __m128i * const s,const __m128i * const f)87*fb1b10abSAndroid Build Coastguard Worker static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
88*fb1b10abSAndroid Build Coastguard Worker                                                    const __m128i *const f) {
89*fb1b10abSAndroid Build Coastguard Worker   // multiply 2 adjacent elements with the filter and add the result
90*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_64 = _mm_set1_epi16(1 << 6);
91*fb1b10abSAndroid Build Coastguard Worker   const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
92*fb1b10abSAndroid Build Coastguard Worker   const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
93*fb1b10abSAndroid Build Coastguard Worker   const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
94*fb1b10abSAndroid Build Coastguard Worker   const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
95*fb1b10abSAndroid Build Coastguard Worker   const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
96*fb1b10abSAndroid Build Coastguard Worker   // compensate the subtracted 64 in f[2]. x5 is always non negative.
97*fb1b10abSAndroid Build Coastguard Worker   const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
98*fb1b10abSAndroid Build Coastguard Worker   __m128i temp;
99*fb1b10abSAndroid Build Coastguard Worker 
100*fb1b10abSAndroid Build Coastguard Worker   // add and saturate the results together
101*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(x0, x1);
102*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x2);
103*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x3);
104*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x4);
105*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, x5);
106*fb1b10abSAndroid Build Coastguard Worker   // round and shift by 7 bit each 16 bit
107*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_adds_epi16(temp, k_64);
108*fb1b10abSAndroid Build Coastguard Worker   temp = _mm_srai_epi16(temp, 7);
109*fb1b10abSAndroid Build Coastguard Worker   return temp;
110*fb1b10abSAndroid Build Coastguard Worker }
111*fb1b10abSAndroid Build Coastguard Worker 
112*fb1b10abSAndroid Build Coastguard Worker #endif  // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
113