1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker *
4*fb1b10abSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker * that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker * tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker * in the file PATENTS. All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker * be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include <emmintrin.h> // SSE2
15*fb1b10abSAndroid Build Coastguard Worker
16*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
17*fb1b10abSAndroid Build Coastguard Worker
// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
// the values at index 2 and 3 replicated as 3 2 3 2 3 2 3 2 (16-bit words).
static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
  // Duplicate each of the two low 32-bit lanes: 3 2 3 2 1 0 1 0.
  const __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
  // Keep the upper 64 bits (3 2 3 2) in both halves of the result.
  return _mm_unpackhi_epi64(tmp, tmp);
}
24*fb1b10abSAndroid Build Coastguard Worker
// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
// the values at index 4 and 5 replicated as 5 4 5 4 5 4 5 4 (16-bit words).
static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
  // Duplicate each of the two high 32-bit lanes: 7 6 7 6 5 4 5 4.
  const __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
  // Keep the lower 64 bits (5 4 5 4) in both halves of the result.
  return _mm_unpacklo_epi64(tmp, tmp);
}
31*fb1b10abSAndroid Build Coastguard Worker
// Interprets the low 8 bytes of each src as unsigned 8-bit values and zero
// extends them to 16-bit words (the high 8 bytes are ignored). Each widened
// src is multiplied with its ker as 16-bit words and adjacent products are
// added to form four 32-bit words. Finally the results from 1 and 2 are added
// together.
static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
                                            const __m128i *const src_2,
                                            const __m128i *const ker_1,
                                            const __m128i *const ker_2) {
  const __m128i zero = _mm_setzero_si128();
  // Interleaving with zero in the high byte is a zero extension to 16 bits.
  const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, zero);
  const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, zero);
  const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
  const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
  return _mm_add_epi32(madd_1, madd_2);
}
45*fb1b10abSAndroid Build Coastguard Worker
// Interprets each src as signed 16-bit words, multiplies it with its ker and
// adds adjacent products to form four 32-bit words. Finally adds the results
// from 1 and 2 together.
static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
                                             const __m128i *const src_2,
                                             const __m128i *const ker_1,
                                             const __m128i *const ker_2) {
  const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
  const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
  return _mm_add_epi32(madd_1, madd_2);
}
57*fb1b10abSAndroid Build Coastguard Worker
// Interprets src_0 and src_1 as signed 16-bit words, multiplies each with the
// same ker and adds adjacent products to form four 32-bit words per source.
// The eight 32-bit sums are then narrowed to 16-bit words with signed
// saturation; the sums from src_0 occupy the low four words of the result and
// the sums from src_1 the high four words.
static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
                                               const __m128i *const src_1,
                                               const __m128i *const ker) {
  const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
  const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
  return _mm_packs_epi32(madd_1, madd_2);
}
65*fb1b10abSAndroid Build Coastguard Worker
// Interleaves the 32-bit words of src_1 and src_2 (src_1 word first), then
// narrows the eight words to 16 bits with signed saturation. With src_1 as
// a3 a2 a1 a0 and src_2 as b3 b2 b1 b0, the result is the 16-bit words
// b3 a3 b2 a2 b1 a1 b0 a0.
static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
                                        const __m128i *const src_2) {
  const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);  // b1 a1 b0 a0
  const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);  // b3 a3 b2 a2
  return _mm_packs_epi32(tmp_1, tmp_2);
}
73*fb1b10abSAndroid Build Coastguard Worker
// Interprets src as signed 32-bit words, adds half_depth to each word and
// arithmetically shifts the sums right by depth bits.
// NOTE(review): for round-to-nearest, callers presumably pass half_depth as
// 1 << (depth - 1) in every 32-bit word; that is not enforced here.
static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
                                          const __m128i *const half_depth,
                                          const int depth) {
  const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
  return _mm_srai_epi32(nearest_src, depth);
}
80*fb1b10abSAndroid Build Coastguard Worker
// Interprets src as signed 16-bit words, adds half_depth to each word with
// signed saturation (so sums clamp to the int16 range instead of wrapping)
// and arithmetically shifts the sums right by depth bits.
// NOTE(review): for round-to-nearest, callers presumably pass half_depth as
// 1 << (depth - 1) in every 16-bit word; that is not enforced here.
static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
                                          const __m128i *const half_depth,
                                          const int depth) {
  const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
  return _mm_srai_epi16(nearest_src, depth);
}
87*fb1b10abSAndroid Build Coastguard Worker
88*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
89