1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker *
4*fb1b10abSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker * that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker * tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker * in the file PATENTS. All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker * be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
13*fb1b10abSAndroid Build Coastguard Worker
#include <immintrin.h>  // AVX2
#include <string.h>     // memcpy

#include "./vpx_config.h"
17*fb1b10abSAndroid Build Coastguard Worker
// MM256_BROADCASTSI128_SI256(x) copies a 128-bit integer vector into both
// halves of a 256-bit vector. Compilers have shipped the underlying intrinsic
// under different names and argument conventions, so the compiler is first
// classified into one of three forms and the macro is then defined once:
//   0 - "_mm_" spelling that takes a pointer to the source vector
//   1 - "_mm_" spelling that takes the source vector by value
//   2 - "_mm256_" spelling that takes the source vector by value
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define VPX_AVX2_BROADCAST128_FORM 0
#else  // clang > 3.3, excluding Apple builds numbered 4.0-4.2 and 5.0
#define VPX_AVX2_BROADCAST128_FORM 2
#endif
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define VPX_AVX2_BROADCAST128_FORM 0
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define VPX_AVX2_BROADCAST128_FORM 1
#else  // gcc > 4.7
#define VPX_AVX2_BROADCAST128_FORM 2
#endif
#else  // neither clang nor gcc
#define VPX_AVX2_BROADCAST128_FORM 2
#endif

#if VPX_AVX2_BROADCAST128_FORM == 0
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif VPX_AVX2_BROADCAST128_FORM == 1
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif
// The selector is only needed to pick the definition above.
#undef VPX_AVX2_BROADCAST128_FORM
41*fb1b10abSAndroid Build Coastguard Worker
shuffle_filter_avx2(const int16_t * const filter,__m256i * const f)42*fb1b10abSAndroid Build Coastguard Worker static INLINE void shuffle_filter_avx2(const int16_t *const filter,
43*fb1b10abSAndroid Build Coastguard Worker __m256i *const f) {
44*fb1b10abSAndroid Build Coastguard Worker const __m256i f_values =
45*fb1b10abSAndroid Build Coastguard Worker MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
46*fb1b10abSAndroid Build Coastguard Worker // pack and duplicate the filter values
47*fb1b10abSAndroid Build Coastguard Worker f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
48*fb1b10abSAndroid Build Coastguard Worker f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
49*fb1b10abSAndroid Build Coastguard Worker f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
50*fb1b10abSAndroid Build Coastguard Worker f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
51*fb1b10abSAndroid Build Coastguard Worker }
52*fb1b10abSAndroid Build Coastguard Worker
convolve8_16_avx2(const __m256i * const s,const __m256i * const f)53*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
54*fb1b10abSAndroid Build Coastguard Worker const __m256i *const f) {
55*fb1b10abSAndroid Build Coastguard Worker // multiply 2 adjacent elements with the filter and add the result
56*fb1b10abSAndroid Build Coastguard Worker const __m256i k_64 = _mm256_set1_epi16(1 << 6);
57*fb1b10abSAndroid Build Coastguard Worker const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
58*fb1b10abSAndroid Build Coastguard Worker const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
59*fb1b10abSAndroid Build Coastguard Worker const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
60*fb1b10abSAndroid Build Coastguard Worker const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
61*fb1b10abSAndroid Build Coastguard Worker __m256i sum1, sum2;
62*fb1b10abSAndroid Build Coastguard Worker
63*fb1b10abSAndroid Build Coastguard Worker // sum the results together, saturating only on the final step
64*fb1b10abSAndroid Build Coastguard Worker // adding x0 with x2 and x1 with x3 is the only order that prevents
65*fb1b10abSAndroid Build Coastguard Worker // outranges for all filters
66*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm256_add_epi16(x0, x2);
67*fb1b10abSAndroid Build Coastguard Worker sum2 = _mm256_add_epi16(x1, x3);
68*fb1b10abSAndroid Build Coastguard Worker // add the rounding offset early to avoid another saturated add
69*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm256_add_epi16(sum1, k_64);
70*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm256_adds_epi16(sum1, sum2);
71*fb1b10abSAndroid Build Coastguard Worker // round and shift by 7 bit each 16 bit
72*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm256_srai_epi16(sum1, 7);
73*fb1b10abSAndroid Build Coastguard Worker return sum1;
74*fb1b10abSAndroid Build Coastguard Worker }
75*fb1b10abSAndroid Build Coastguard Worker
convolve8_8_avx2(const __m256i * const s,const __m256i * const f)76*fb1b10abSAndroid Build Coastguard Worker static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
77*fb1b10abSAndroid Build Coastguard Worker const __m256i *const f) {
78*fb1b10abSAndroid Build Coastguard Worker // multiply 2 adjacent elements with the filter and add the result
79*fb1b10abSAndroid Build Coastguard Worker const __m128i k_64 = _mm_set1_epi16(1 << 6);
80*fb1b10abSAndroid Build Coastguard Worker const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
81*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi256_si128(f[0]));
82*fb1b10abSAndroid Build Coastguard Worker const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
83*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi256_si128(f[1]));
84*fb1b10abSAndroid Build Coastguard Worker const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
85*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi256_si128(f[2]));
86*fb1b10abSAndroid Build Coastguard Worker const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
87*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi256_si128(f[3]));
88*fb1b10abSAndroid Build Coastguard Worker __m128i sum1, sum2;
89*fb1b10abSAndroid Build Coastguard Worker
90*fb1b10abSAndroid Build Coastguard Worker // sum the results together, saturating only on the final step
91*fb1b10abSAndroid Build Coastguard Worker // adding x0 with x2 and x1 with x3 is the only order that prevents
92*fb1b10abSAndroid Build Coastguard Worker // outranges for all filters
93*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm_add_epi16(x0, x2);
94*fb1b10abSAndroid Build Coastguard Worker sum2 = _mm_add_epi16(x1, x3);
95*fb1b10abSAndroid Build Coastguard Worker // add the rounding offset early to avoid another saturated add
96*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm_add_epi16(sum1, k_64);
97*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm_adds_epi16(sum1, sum2);
98*fb1b10abSAndroid Build Coastguard Worker // shift by 7 bit each 16 bit
99*fb1b10abSAndroid Build Coastguard Worker sum1 = _mm_srai_epi16(sum1, 7);
100*fb1b10abSAndroid Build Coastguard Worker return sum1;
101*fb1b10abSAndroid Build Coastguard Worker }
102*fb1b10abSAndroid Build Coastguard Worker
mm256_loadu2_si128(const void * lo,const void * hi)103*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
104*fb1b10abSAndroid Build Coastguard Worker const __m256i tmp =
105*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
106*fb1b10abSAndroid Build Coastguard Worker return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
107*fb1b10abSAndroid Build Coastguard Worker }
108*fb1b10abSAndroid Build Coastguard Worker
mm256_loadu2_epi64(const void * lo,const void * hi)109*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
110*fb1b10abSAndroid Build Coastguard Worker const __m256i tmp =
111*fb1b10abSAndroid Build Coastguard Worker _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
112*fb1b10abSAndroid Build Coastguard Worker return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
113*fb1b10abSAndroid Build Coastguard Worker }
114*fb1b10abSAndroid Build Coastguard Worker
mm256_store2_si128(__m128i * const dst_ptr_1,__m128i * const dst_ptr_2,const __m256i * const src)115*fb1b10abSAndroid Build Coastguard Worker static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
116*fb1b10abSAndroid Build Coastguard Worker __m128i *const dst_ptr_2,
117*fb1b10abSAndroid Build Coastguard Worker const __m256i *const src) {
118*fb1b10abSAndroid Build Coastguard Worker _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
119*fb1b10abSAndroid Build Coastguard Worker _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
120*fb1b10abSAndroid Build Coastguard Worker }
121*fb1b10abSAndroid Build Coastguard Worker
mm256_storeu2_epi64(__m128i * const dst_ptr_1,__m128i * const dst_ptr_2,const __m256i * const src)122*fb1b10abSAndroid Build Coastguard Worker static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
123*fb1b10abSAndroid Build Coastguard Worker __m128i *const dst_ptr_2,
124*fb1b10abSAndroid Build Coastguard Worker const __m256i *const src) {
125*fb1b10abSAndroid Build Coastguard Worker _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
126*fb1b10abSAndroid Build Coastguard Worker _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
127*fb1b10abSAndroid Build Coastguard Worker }
128*fb1b10abSAndroid Build Coastguard Worker
mm256_storeu2_epi32(__m128i * const dst_ptr_1,__m128i * const dst_ptr_2,const __m256i * const src)129*fb1b10abSAndroid Build Coastguard Worker static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
130*fb1b10abSAndroid Build Coastguard Worker __m128i *const dst_ptr_2,
131*fb1b10abSAndroid Build Coastguard Worker const __m256i *const src) {
132*fb1b10abSAndroid Build Coastguard Worker *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
133*fb1b10abSAndroid Build Coastguard Worker *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
134*fb1b10abSAndroid Build Coastguard Worker }
135*fb1b10abSAndroid Build Coastguard Worker
mm256_round_epi32(const __m256i * const src,const __m256i * const half_depth,const int depth)136*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i mm256_round_epi32(const __m256i *const src,
137*fb1b10abSAndroid Build Coastguard Worker const __m256i *const half_depth,
138*fb1b10abSAndroid Build Coastguard Worker const int depth) {
139*fb1b10abSAndroid Build Coastguard Worker const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
140*fb1b10abSAndroid Build Coastguard Worker return _mm256_srai_epi32(nearest_src, depth);
141*fb1b10abSAndroid Build Coastguard Worker }
142*fb1b10abSAndroid Build Coastguard Worker
mm256_round_epi16(const __m256i * const src,const __m256i * const half_depth,const int depth)143*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i mm256_round_epi16(const __m256i *const src,
144*fb1b10abSAndroid Build Coastguard Worker const __m256i *const half_depth,
145*fb1b10abSAndroid Build Coastguard Worker const int depth) {
146*fb1b10abSAndroid Build Coastguard Worker const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
147*fb1b10abSAndroid Build Coastguard Worker return _mm256_srai_epi16(nearest_src, depth);
148*fb1b10abSAndroid Build Coastguard Worker }
149*fb1b10abSAndroid Build Coastguard Worker
mm256_madd_add_epi32(const __m256i * const src_0,const __m256i * const src_1,const __m256i * const ker_0,const __m256i * const ker_1)150*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
151*fb1b10abSAndroid Build Coastguard Worker const __m256i *const src_1,
152*fb1b10abSAndroid Build Coastguard Worker const __m256i *const ker_0,
153*fb1b10abSAndroid Build Coastguard Worker const __m256i *const ker_1) {
154*fb1b10abSAndroid Build Coastguard Worker const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
155*fb1b10abSAndroid Build Coastguard Worker const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
156*fb1b10abSAndroid Build Coastguard Worker return _mm256_add_epi32(tmp_0, tmp_1);
157*fb1b10abSAndroid Build Coastguard Worker }
158*fb1b10abSAndroid Build Coastguard Worker
159*fb1b10abSAndroid Build Coastguard Worker #undef MM256_BROADCASTSI128_SI256
160*fb1b10abSAndroid Build Coastguard Worker
161*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
162