/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AOM_DSP_X86_BLEND_SSE4_H_
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/blend.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms.h"
// Index table of 32 bytes. Within each 16-entry half the even indices
// 0, 2, ..., 14 come first, followed by the odd indices 1, 3, ..., 15; the
// same 16-entry pattern is stored twice.
// NOTE(review): presumably a byte-shuffle control that separates even and odd
// bytes within each 128-bit lane -- confirm against the users of this table.
static const uint8_t g_blend_a64_mask_shuffle[32] = {
  // Entries 0..15.
  0, 2, 4, 6, 8, 10, 12, 14,  // even indices
  1, 3, 5, 7, 9, 11, 13, 15,  // odd indices
  // Entries 16..31 (same pattern again).
  0, 2, 4, 6, 8, 10, 12, 14,  // even indices
  1, 3, 5, 7, 9, 11, 13, 15,  // odd indices
};
21*77c1e3ccSAndroid Build Coastguard Worker
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
25*77c1e3ccSAndroid Build Coastguard Worker
blend_4(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)26*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
27*77c1e3ccSAndroid Build Coastguard Worker const __m128i *v_m0_w, const __m128i *v_m1_w) {
28*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_b = xx_loadl_32(src0);
29*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_b = xx_loadl_32(src1);
30*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
31*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
32*77c1e3ccSAndroid Build Coastguard Worker
33*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
34*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
35*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
36*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
37*77c1e3ccSAndroid Build Coastguard Worker
38*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
39*77c1e3ccSAndroid Build Coastguard Worker }
40*77c1e3ccSAndroid Build Coastguard Worker
blend_8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)41*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
42*77c1e3ccSAndroid Build Coastguard Worker const __m128i *v_m0_w, const __m128i *v_m1_w) {
43*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_b = xx_loadl_64(src0);
44*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_b = xx_loadl_64(src1);
45*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
46*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
47*77c1e3ccSAndroid Build Coastguard Worker
48*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
49*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
50*77c1e3ccSAndroid Build Coastguard Worker
51*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
52*77c1e3ccSAndroid Build Coastguard Worker
53*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54*77c1e3ccSAndroid Build Coastguard Worker
55*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
56*77c1e3ccSAndroid Build Coastguard Worker }
57*77c1e3ccSAndroid Build Coastguard Worker
blend_4_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)58*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
59*77c1e3ccSAndroid Build Coastguard Worker const __m128i *v_m0_b, const __m128i *v_m1_b,
60*77c1e3ccSAndroid Build Coastguard Worker const __m128i *rounding) {
61*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_b = xx_loadl_32(src0);
62*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_b = xx_loadl_32(src1);
63*77c1e3ccSAndroid Build Coastguard Worker
64*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65*77c1e3ccSAndroid Build Coastguard Worker _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66*77c1e3ccSAndroid Build Coastguard Worker
67*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69*77c1e3ccSAndroid Build Coastguard Worker return v_res;
70*77c1e3ccSAndroid Build Coastguard Worker }
71*77c1e3ccSAndroid Build Coastguard Worker
blend_8_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)72*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
73*77c1e3ccSAndroid Build Coastguard Worker const __m128i *v_m0_b, const __m128i *v_m1_b,
74*77c1e3ccSAndroid Build Coastguard Worker const __m128i *rounding) {
75*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_b = xx_loadl_64(src0);
76*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_b = xx_loadl_64(src1);
77*77c1e3ccSAndroid Build Coastguard Worker
78*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79*77c1e3ccSAndroid Build Coastguard Worker _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80*77c1e3ccSAndroid Build Coastguard Worker
81*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83*77c1e3ccSAndroid Build Coastguard Worker return v_res;
84*77c1e3ccSAndroid Build Coastguard Worker }
85*77c1e3ccSAndroid Build Coastguard Worker
blend_16_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)86*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
87*77c1e3ccSAndroid Build Coastguard Worker const __m128i *v_m0_b, const __m128i *v_m1_b,
88*77c1e3ccSAndroid Build Coastguard Worker const __m128i *rounding) {
89*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_b = xx_loadu_128(src0);
90*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_b = xx_loadu_128(src1);
91*77c1e3ccSAndroid Build Coastguard Worker
92*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93*77c1e3ccSAndroid Build Coastguard Worker _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95*77c1e3ccSAndroid Build Coastguard Worker _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96*77c1e3ccSAndroid Build Coastguard Worker
97*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100*77c1e3ccSAndroid Build Coastguard Worker return v_res;
101*77c1e3ccSAndroid Build Coastguard Worker }
102*77c1e3ccSAndroid Build Coastguard Worker
// Pointer to a blend kernel operating on 16-bit samples: takes two source
// pointers and the two weight vectors (by value) and returns the blended
// vector. The blend_*_b10 and blend_*_b12 functions in this header have this
// signature.
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                 __m128i v_m0_w, __m128i v_m1_w);
105*77c1e3ccSAndroid Build Coastguard Worker
blend_4_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)106*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
107*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m0_w, const __m128i v_m1_w) {
108*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = xx_loadl_64(src0);
109*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = xx_loadl_64(src1);
110*77c1e3ccSAndroid Build Coastguard Worker
111*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113*77c1e3ccSAndroid Build Coastguard Worker
114*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115*77c1e3ccSAndroid Build Coastguard Worker
116*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117*77c1e3ccSAndroid Build Coastguard Worker
118*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
119*77c1e3ccSAndroid Build Coastguard Worker }
120*77c1e3ccSAndroid Build Coastguard Worker
blend_8_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)121*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
122*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m0_w, const __m128i v_m1_w) {
123*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = xx_loadu_128(src0);
124*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = xx_loadu_128(src1);
125*77c1e3ccSAndroid Build Coastguard Worker
126*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128*77c1e3ccSAndroid Build Coastguard Worker
129*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130*77c1e3ccSAndroid Build Coastguard Worker
131*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132*77c1e3ccSAndroid Build Coastguard Worker
133*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
134*77c1e3ccSAndroid Build Coastguard Worker }
135*77c1e3ccSAndroid Build Coastguard Worker
blend_4_b12(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)136*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
137*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m0_w, const __m128i v_m1_w) {
138*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = xx_loadl_64(src0);
139*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = xx_loadl_64(src1);
140*77c1e3ccSAndroid Build Coastguard Worker
141*77c1e3ccSAndroid Build Coastguard Worker // Interleave
142*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
143*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
144*77c1e3ccSAndroid Build Coastguard Worker
145*77c1e3ccSAndroid Build Coastguard Worker // Multiply-Add
146*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
147*77c1e3ccSAndroid Build Coastguard Worker
148*77c1e3ccSAndroid Build Coastguard Worker // Scale
149*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_ssum_d =
150*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
151*77c1e3ccSAndroid Build Coastguard Worker
152*77c1e3ccSAndroid Build Coastguard Worker // Pack
153*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
154*77c1e3ccSAndroid Build Coastguard Worker
155*77c1e3ccSAndroid Build Coastguard Worker // Round
156*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_round_epu16(v_pssum_d);
157*77c1e3ccSAndroid Build Coastguard Worker
158*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
159*77c1e3ccSAndroid Build Coastguard Worker }
160*77c1e3ccSAndroid Build Coastguard Worker
blend_8_b12(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)161*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
162*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m0_w, const __m128i v_m1_w) {
163*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s0_w = xx_loadu_128(src0);
164*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s1_w = xx_loadu_128(src1);
165*77c1e3ccSAndroid Build Coastguard Worker
166*77c1e3ccSAndroid Build Coastguard Worker // Interleave
167*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
168*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
169*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
170*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
171*77c1e3ccSAndroid Build Coastguard Worker
172*77c1e3ccSAndroid Build Coastguard Worker // Multiply-Add
173*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
174*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
175*77c1e3ccSAndroid Build Coastguard Worker
176*77c1e3ccSAndroid Build Coastguard Worker // Scale
177*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_ssuml_d =
178*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
179*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_ssumh_d =
180*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
181*77c1e3ccSAndroid Build Coastguard Worker
182*77c1e3ccSAndroid Build Coastguard Worker // Pack
183*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
184*77c1e3ccSAndroid Build Coastguard Worker
185*77c1e3ccSAndroid Build Coastguard Worker // Round
186*77c1e3ccSAndroid Build Coastguard Worker const __m128i v_res_w = xx_round_epu16(v_pssum_d);
187*77c1e3ccSAndroid Build Coastguard Worker
188*77c1e3ccSAndroid Build Coastguard Worker return v_res_w;
189*77c1e3ccSAndroid Build Coastguard Worker }
190*77c1e3ccSAndroid Build Coastguard Worker
191*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
192