1*abb65b4bSAndroid Build Coastguard Worker /*
2*abb65b4bSAndroid Build Coastguard Worker * Copyright (c) 2022 Samsung Electronics Co., Ltd.
3*abb65b4bSAndroid Build Coastguard Worker * All Rights Reserved.
4*abb65b4bSAndroid Build Coastguard Worker *
5*abb65b4bSAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
6*abb65b4bSAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
7*abb65b4bSAndroid Build Coastguard Worker *
8*abb65b4bSAndroid Build Coastguard Worker * - Redistributions of source code must retain the above copyright notice,
9*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer.
10*abb65b4bSAndroid Build Coastguard Worker *
11*abb65b4bSAndroid Build Coastguard Worker * - Redistributions in binary form must reproduce the above copyright notice,
12*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer in the documentation
13*abb65b4bSAndroid Build Coastguard Worker * and/or other materials provided with the distribution.
14*abb65b4bSAndroid Build Coastguard Worker *
15*abb65b4bSAndroid Build Coastguard Worker * - Neither the name of the copyright owner, nor the names of its contributors
16*abb65b4bSAndroid Build Coastguard Worker * may be used to endorse or promote products derived from this software
17*abb65b4bSAndroid Build Coastguard Worker * without specific prior written permission.
18*abb65b4bSAndroid Build Coastguard Worker *
19*abb65b4bSAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20*abb65b4bSAndroid Build Coastguard Worker * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21*abb65b4bSAndroid Build Coastguard Worker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22*abb65b4bSAndroid Build Coastguard Worker * ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23*abb65b4bSAndroid Build Coastguard Worker * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*abb65b4bSAndroid Build Coastguard Worker * CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*abb65b4bSAndroid Build Coastguard Worker * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*abb65b4bSAndroid Build Coastguard Worker * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*abb65b4bSAndroid Build Coastguard Worker * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*abb65b4bSAndroid Build Coastguard Worker * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*abb65b4bSAndroid Build Coastguard Worker * POSSIBILITY OF SUCH DAMAGE.
30*abb65b4bSAndroid Build Coastguard Worker */
31*abb65b4bSAndroid Build Coastguard Worker
32*abb65b4bSAndroid Build Coastguard Worker #include "oapv_def.h"
33*abb65b4bSAndroid Build Coastguard Worker #include "oapv_tq_avx.h"
34*abb65b4bSAndroid Build Coastguard Worker
35*abb65b4bSAndroid Build Coastguard Worker #ifndef _mm256_set_m128i
36*abb65b4bSAndroid Build Coastguard Worker #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
37*abb65b4bSAndroid Build Coastguard Worker _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
38*abb65b4bSAndroid Build Coastguard Worker #endif // !_mm256_set_m128i
39*abb65b4bSAndroid Build Coastguard Worker
40*abb65b4bSAndroid Build Coastguard Worker #ifndef _mm256_loadu2_m128i
41*abb65b4bSAndroid Build Coastguard Worker #define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
42*abb65b4bSAndroid Build Coastguard Worker /* __m128i const* */ loaddr) \
43*abb65b4bSAndroid Build Coastguard Worker _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
44*abb65b4bSAndroid Build Coastguard Worker #endif // !_mm256_loadu2_m128i
45*abb65b4bSAndroid Build Coastguard Worker
oapv_tx_part_avx(s16 * src,s16 * dst,int shift,int line)46*abb65b4bSAndroid Build Coastguard Worker static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
47*abb65b4bSAndroid Build Coastguard Worker {
48*abb65b4bSAndroid Build Coastguard Worker __m256i v0, v1, v2, v3, v4, v5, v6, v7;
49*abb65b4bSAndroid Build Coastguard Worker __m256i d0, d1, d2, d3;
50*abb65b4bSAndroid Build Coastguard Worker __m256i coeff[8];
51*abb65b4bSAndroid Build Coastguard Worker coeff[0] = _mm256_set1_epi16(64);
52*abb65b4bSAndroid Build Coastguard Worker coeff[1] = _mm256_set_epi16(64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64);
53*abb65b4bSAndroid Build Coastguard Worker coeff[2] = _mm256_set_epi16(84, 35, -35, -84, -84, -35, 35, 84, 84, 35, -35, -84, -84, -35, 35, 84);
54*abb65b4bSAndroid Build Coastguard Worker coeff[3] = _mm256_set_epi16(35, -84, 84, -35, -35, 84, -84, 35, 35, -84, 84, -35, -35, 84, -84, 35);
55*abb65b4bSAndroid Build Coastguard Worker coeff[4] = _mm256_set_epi16(-89, -75, -50, -18, 18, 50, 75, 89, -89, -75, -50, -18, 18, 50, 75, 89);
56*abb65b4bSAndroid Build Coastguard Worker coeff[5] = _mm256_set_epi16(-75, 18, 89, 50, -50, -89, -18, 75, -75, 18, 89, 50, -50, -89, -18, 75);
57*abb65b4bSAndroid Build Coastguard Worker coeff[6] = _mm256_set_epi16(-50, 89, -18, -75, 75, 18, -89, 50, -50, 89, -18, -75, 75, 18, -89, 50);
58*abb65b4bSAndroid Build Coastguard Worker coeff[7] = _mm256_set_epi16(-18, 50, -75, 89, -89, 75, -50, 18, -18, 50, -75, 89, -89, 75, -50, 18);
59*abb65b4bSAndroid Build Coastguard Worker __m256i add = _mm256_set1_epi32(1 << (shift - 1));
60*abb65b4bSAndroid Build Coastguard Worker
61*abb65b4bSAndroid Build Coastguard Worker __m256i s0, s1, s2, s3;
62*abb65b4bSAndroid Build Coastguard Worker
63*abb65b4bSAndroid Build Coastguard Worker s0 = _mm256_loadu2_m128i((const __m128i *)&src[32], (const __m128i *)&src[0]);
64*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_loadu2_m128i((const __m128i *)&src[40], (const __m128i *)&src[8]);
65*abb65b4bSAndroid Build Coastguard Worker s2 = _mm256_loadu2_m128i((const __m128i *)&src[48], (const __m128i *)&src[16]);
66*abb65b4bSAndroid Build Coastguard Worker s3 = _mm256_loadu2_m128i((const __m128i *)&src[56], (const __m128i *)&src[24]);
67*abb65b4bSAndroid Build Coastguard Worker
68*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8(coeff[0], coeff[4], d0, d1);
69*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8(coeff[2], coeff[5], d2, d3);
70*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift)
71*abb65b4bSAndroid Build Coastguard Worker
72*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_packs_epi32(d0, d1);
73*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_packs_epi32(d2, d3);
74*abb65b4bSAndroid Build Coastguard Worker
75*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_permute4x64_epi64(d0, 0xd8);
76*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_permute4x64_epi64(d1, 0xd8);
77*abb65b4bSAndroid Build Coastguard Worker
78*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)dst, _mm256_castsi256_si128(d0));
79*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 1 * line), _mm256_extracti128_si256(d0, 1));
80*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 2 * line), _mm256_castsi256_si128(d1));
81*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 3 * line), _mm256_extracti128_si256(d1, 1));
82*abb65b4bSAndroid Build Coastguard Worker
83*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8(coeff[1], coeff[6], d0, d1);
84*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8(coeff[3], coeff[7], d2, d3);
85*abb65b4bSAndroid Build Coastguard Worker CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift);
86*abb65b4bSAndroid Build Coastguard Worker
87*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_packs_epi32(d0, d1);
88*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_packs_epi32(d2, d3);
89*abb65b4bSAndroid Build Coastguard Worker
90*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_permute4x64_epi64(d0, 0xd8);
91*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_permute4x64_epi64(d1, 0xd8);
92*abb65b4bSAndroid Build Coastguard Worker
93*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 4 * line), _mm256_castsi256_si128(d0));
94*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 5 * line), _mm256_extracti128_si256(d0, 1));
95*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 6 * line), _mm256_castsi256_si128(d1));
96*abb65b4bSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)(dst + 7 * line), _mm256_extracti128_si256(d1, 1));
97*abb65b4bSAndroid Build Coastguard Worker }
98*abb65b4bSAndroid Build Coastguard Worker
99*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2] =
100*abb65b4bSAndroid Build Coastguard Worker {
101*abb65b4bSAndroid Build Coastguard Worker oapv_tx_part_avx,
102*abb65b4bSAndroid Build Coastguard Worker NULL
103*abb65b4bSAndroid Build Coastguard Worker };
104*abb65b4bSAndroid Build Coastguard Worker
105*abb65b4bSAndroid Build Coastguard Worker ///////////////////////////////////////////////////////////////////////////////
106*abb65b4bSAndroid Build Coastguard Worker // end of encoder code
107*abb65b4bSAndroid Build Coastguard Worker // ENABLE_ENCODER
108*abb65b4bSAndroid Build Coastguard Worker ///////////////////////////////////////////////////////////////////////////////
109*abb65b4bSAndroid Build Coastguard Worker
110*abb65b4bSAndroid Build Coastguard Worker #define TRANSPOSE_8x4_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3) \
111*abb65b4bSAndroid Build Coastguard Worker tr0_0 = _mm_unpacklo_epi16(I0, I1); \
112*abb65b4bSAndroid Build Coastguard Worker tr0_1 = _mm_unpacklo_epi16(I2, I3); \
113*abb65b4bSAndroid Build Coastguard Worker tr0_2 = _mm_unpacklo_epi16(I4, I5); \
114*abb65b4bSAndroid Build Coastguard Worker tr0_3 = _mm_unpacklo_epi16(I6, I7); \
115*abb65b4bSAndroid Build Coastguard Worker tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
116*abb65b4bSAndroid Build Coastguard Worker tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
117*abb65b4bSAndroid Build Coastguard Worker tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
118*abb65b4bSAndroid Build Coastguard Worker tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
119*abb65b4bSAndroid Build Coastguard Worker O0 = _mm_unpacklo_epi64(tr1_0, tr1_2); \
120*abb65b4bSAndroid Build Coastguard Worker O1 = _mm_unpackhi_epi64(tr1_0, tr1_2); \
121*abb65b4bSAndroid Build Coastguard Worker O2 = _mm_unpacklo_epi64(tr1_1, tr1_3); \
122*abb65b4bSAndroid Build Coastguard Worker O3 = _mm_unpackhi_epi64(tr1_1, tr1_3);
123*abb65b4bSAndroid Build Coastguard Worker
124*abb65b4bSAndroid Build Coastguard Worker // transpose 8x8: 8 x 8(32bit) --> 8 x 8(16bit)
125*abb65b4bSAndroid Build Coastguard Worker // O0: row0, row4
126*abb65b4bSAndroid Build Coastguard Worker // O1: row1, row5
127*abb65b4bSAndroid Build Coastguard Worker // O2: row2, row6
128*abb65b4bSAndroid Build Coastguard Worker // O3: row3, row7
129*abb65b4bSAndroid Build Coastguard Worker #define TRANSPOSE_8x8_32BIT_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3) \
130*abb65b4bSAndroid Build Coastguard Worker I0 = _mm256_packs_epi32(I0, I4); \
131*abb65b4bSAndroid Build Coastguard Worker I1 = _mm256_packs_epi32(I1, I5); \
132*abb65b4bSAndroid Build Coastguard Worker I2 = _mm256_packs_epi32(I2, I6); \
133*abb65b4bSAndroid Build Coastguard Worker I3 = _mm256_packs_epi32(I3, I7); \
134*abb65b4bSAndroid Build Coastguard Worker I4 = _mm256_unpacklo_epi16(I0, I2); \
135*abb65b4bSAndroid Build Coastguard Worker I5 = _mm256_unpackhi_epi16(I0, I2); \
136*abb65b4bSAndroid Build Coastguard Worker I6 = _mm256_unpacklo_epi16(I1, I3); \
137*abb65b4bSAndroid Build Coastguard Worker I7 = _mm256_unpackhi_epi16(I1, I3); \
138*abb65b4bSAndroid Build Coastguard Worker I0 = _mm256_unpacklo_epi16(I4, I6); \
139*abb65b4bSAndroid Build Coastguard Worker I1 = _mm256_unpackhi_epi16(I4, I6); \
140*abb65b4bSAndroid Build Coastguard Worker I2 = _mm256_unpacklo_epi16(I5, I7); \
141*abb65b4bSAndroid Build Coastguard Worker I3 = _mm256_unpackhi_epi16(I5, I7); \
142*abb65b4bSAndroid Build Coastguard Worker O0 = _mm256_unpacklo_epi64(I0, I2); \
143*abb65b4bSAndroid Build Coastguard Worker O1 = _mm256_unpackhi_epi64(I0, I2); \
144*abb65b4bSAndroid Build Coastguard Worker O2 = _mm256_unpacklo_epi64(I1, I3); \
145*abb65b4bSAndroid Build Coastguard Worker O3 = _mm256_unpackhi_epi64(I1, I3)
146*abb65b4bSAndroid Build Coastguard Worker
147*abb65b4bSAndroid Build Coastguard Worker // transpose 8x8: 16 x 8(32bit) --> 8 x 16(16bit)
148*abb65b4bSAndroid Build Coastguard Worker #define TRANSPOSE_16x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I08, I09, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7)\
149*abb65b4bSAndroid Build Coastguard Worker TRANSPOSE_8x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I04, I05, I06, I07); \
150*abb65b4bSAndroid Build Coastguard Worker TRANSPOSE_8x8_32BIT_16BIT(I08, I09, I10, I11, I12, I13, I14, I15, I12, I13, I14, I15); \
151*abb65b4bSAndroid Build Coastguard Worker O0 = _mm256_insertf128_si256(I04, _mm256_castsi256_si128(I12), 1); \
152*abb65b4bSAndroid Build Coastguard Worker O1 = _mm256_insertf128_si256(I05, _mm256_castsi256_si128(I13), 1); \
153*abb65b4bSAndroid Build Coastguard Worker O2 = _mm256_insertf128_si256(I06, _mm256_castsi256_si128(I14), 1); \
154*abb65b4bSAndroid Build Coastguard Worker O3 = _mm256_insertf128_si256(I07, _mm256_castsi256_si128(I15), 1); \
155*abb65b4bSAndroid Build Coastguard Worker O4 = _mm256_insertf128_si256(I12, _mm256_extracti128_si256(I04, 1), 0); \
156*abb65b4bSAndroid Build Coastguard Worker O5 = _mm256_insertf128_si256(I13, _mm256_extracti128_si256(I05, 1), 0); \
157*abb65b4bSAndroid Build Coastguard Worker O6 = _mm256_insertf128_si256(I14, _mm256_extracti128_si256(I06, 1), 0); \
158*abb65b4bSAndroid Build Coastguard Worker O7 = _mm256_insertf128_si256(I15, _mm256_extracti128_si256(I07, 1), 0)
159*abb65b4bSAndroid Build Coastguard Worker
160*abb65b4bSAndroid Build Coastguard Worker #define set_vals(a,b) b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a
161*abb65b4bSAndroid Build Coastguard Worker #define set_vals1(a,b) b, a, b, a, b, a, b, a
162*abb65b4bSAndroid Build Coastguard Worker
oapv_itx_part_avx(s16 * src,s16 * dst,int shift,int line)163*abb65b4bSAndroid Build Coastguard Worker static void oapv_itx_part_avx(s16* src, s16* dst, int shift, int line)
164*abb65b4bSAndroid Build Coastguard Worker {
165*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p89_p75 = _mm256_setr_epi16(89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75); // 89 75
166*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p50_p18 = _mm256_setr_epi16(50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18); // 50, 18
167*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p75_n18 = _mm256_setr_epi16(75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18); // 75, -18
168*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_n89_n50 = _mm256_setr_epi16(-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50); // -89, -50
169*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p50_n89 = _mm256_setr_epi16(50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89); // 50,-89
170*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p18_p75 = _mm256_setr_epi16(18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75); // 18, 75
171*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p18_n50 = _mm256_setr_epi16(18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50); // 18,-50
172*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p75_n89 = _mm256_setr_epi16(75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89); // 75,-89
173*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p64_p64 = _mm256_setr_epi16(64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64); // 64, 64
174*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p64_n64 = _mm256_setr_epi16(64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64); // 64, -64
175*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p84_n35 = _mm256_setr_epi16(84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35, 84, 35); // 84, 35
176*abb65b4bSAndroid Build Coastguard Worker const __m256i coeff_p35_n84 = _mm256_setr_epi16(35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84, 35, -84); // 35, -84
177*abb65b4bSAndroid Build Coastguard Worker
178*abb65b4bSAndroid Build Coastguard Worker __m128i s0, s1, s2, s3, s4, s5, s6, s7;
179*abb65b4bSAndroid Build Coastguard Worker __m128i ss0, ss1, ss2, ss3;
180*abb65b4bSAndroid Build Coastguard Worker __m256i e0, e1, e2, e3, o0, o1, o2, o3, ee0, ee1, eo0, eo1;
181*abb65b4bSAndroid Build Coastguard Worker __m256i t0, t1, t2, t3;
182*abb65b4bSAndroid Build Coastguard Worker __m256i d0, d1, d2, d3, d4, d5, d6, d7;
183*abb65b4bSAndroid Build Coastguard Worker __m256i offset = _mm256_set1_epi32(1 << (shift - 1));
184*abb65b4bSAndroid Build Coastguard Worker int j;
185*abb65b4bSAndroid Build Coastguard Worker int i_src = line;
186*abb65b4bSAndroid Build Coastguard Worker int i_src2 = line << 1;
187*abb65b4bSAndroid Build Coastguard Worker int i_src3 = i_src + i_src2;
188*abb65b4bSAndroid Build Coastguard Worker int i_src4 = i_src << 2;
189*abb65b4bSAndroid Build Coastguard Worker int i_src5 = i_src2 + i_src3;
190*abb65b4bSAndroid Build Coastguard Worker int i_src6 = i_src3 << 1;
191*abb65b4bSAndroid Build Coastguard Worker int i_src7 = i_src3 + i_src4;
192*abb65b4bSAndroid Build Coastguard Worker for (j = 0; j < line; j += 8)
193*abb65b4bSAndroid Build Coastguard Worker {
194*abb65b4bSAndroid Build Coastguard Worker // O[0] -- O[3]
195*abb65b4bSAndroid Build Coastguard Worker s1 = _mm_loadu_si128((__m128i*)(src + i_src + j));
196*abb65b4bSAndroid Build Coastguard Worker s3 = _mm_loadu_si128((__m128i*)(src + i_src3 + j));
197*abb65b4bSAndroid Build Coastguard Worker s5 = _mm_loadu_si128((__m128i*)(src + i_src5 + j));
198*abb65b4bSAndroid Build Coastguard Worker s7 = _mm_loadu_si128((__m128i*)(src + i_src7 + j));
199*abb65b4bSAndroid Build Coastguard Worker
200*abb65b4bSAndroid Build Coastguard Worker ss0 = _mm_unpacklo_epi16(s1, s3);
201*abb65b4bSAndroid Build Coastguard Worker ss1 = _mm_unpackhi_epi16(s1, s3);
202*abb65b4bSAndroid Build Coastguard Worker ss2 = _mm_unpacklo_epi16(s5, s7);
203*abb65b4bSAndroid Build Coastguard Worker ss3 = _mm_unpackhi_epi16(s5, s7);
204*abb65b4bSAndroid Build Coastguard Worker
205*abb65b4bSAndroid Build Coastguard Worker e0 = _mm256_set_m128i(ss1, ss0);
206*abb65b4bSAndroid Build Coastguard Worker e1 = _mm256_set_m128i(ss3, ss2);
207*abb65b4bSAndroid Build Coastguard Worker
208*abb65b4bSAndroid Build Coastguard Worker t0 = _mm256_madd_epi16(e0, coeff_p89_p75);
209*abb65b4bSAndroid Build Coastguard Worker t1 = _mm256_madd_epi16(e1, coeff_p50_p18);
210*abb65b4bSAndroid Build Coastguard Worker t2 = _mm256_madd_epi16(e0, coeff_p75_n18);
211*abb65b4bSAndroid Build Coastguard Worker t3 = _mm256_madd_epi16(e1, coeff_n89_n50);
212*abb65b4bSAndroid Build Coastguard Worker o0 = _mm256_add_epi32(t0, t1);
213*abb65b4bSAndroid Build Coastguard Worker o1 = _mm256_add_epi32(t2, t3);
214*abb65b4bSAndroid Build Coastguard Worker
215*abb65b4bSAndroid Build Coastguard Worker t0 = _mm256_madd_epi16(e0, coeff_p50_n89);
216*abb65b4bSAndroid Build Coastguard Worker t1 = _mm256_madd_epi16(e1, coeff_p18_p75);
217*abb65b4bSAndroid Build Coastguard Worker t2 = _mm256_madd_epi16(e0, coeff_p18_n50);
218*abb65b4bSAndroid Build Coastguard Worker t3 = _mm256_madd_epi16(e1, coeff_p75_n89);
219*abb65b4bSAndroid Build Coastguard Worker
220*abb65b4bSAndroid Build Coastguard Worker o2 = _mm256_add_epi32(t0, t1);
221*abb65b4bSAndroid Build Coastguard Worker o3 = _mm256_add_epi32(t2, t3);
222*abb65b4bSAndroid Build Coastguard Worker
223*abb65b4bSAndroid Build Coastguard Worker // E[0] - E[3]
224*abb65b4bSAndroid Build Coastguard Worker s0 = _mm_loadu_si128((__m128i*)(src + j));
225*abb65b4bSAndroid Build Coastguard Worker s2 = _mm_loadu_si128((__m128i*)(src + i_src2 + j));
226*abb65b4bSAndroid Build Coastguard Worker s4 = _mm_loadu_si128((__m128i*)(src + i_src4 + j));
227*abb65b4bSAndroid Build Coastguard Worker s6 = _mm_loadu_si128((__m128i*)(src + i_src6 + j));
228*abb65b4bSAndroid Build Coastguard Worker
229*abb65b4bSAndroid Build Coastguard Worker ss0 = _mm_unpacklo_epi16(s0, s4);
230*abb65b4bSAndroid Build Coastguard Worker ss1 = _mm_unpackhi_epi16(s0, s4);
231*abb65b4bSAndroid Build Coastguard Worker ss2 = _mm_unpacklo_epi16(s2, s6);
232*abb65b4bSAndroid Build Coastguard Worker ss3 = _mm_unpackhi_epi16(s2, s6);
233*abb65b4bSAndroid Build Coastguard Worker
234*abb65b4bSAndroid Build Coastguard Worker e0 = _mm256_set_m128i(ss1, ss0);
235*abb65b4bSAndroid Build Coastguard Worker e1 = _mm256_set_m128i(ss3, ss2);
236*abb65b4bSAndroid Build Coastguard Worker
237*abb65b4bSAndroid Build Coastguard Worker ee0 = _mm256_madd_epi16(e0, coeff_p64_p64);
238*abb65b4bSAndroid Build Coastguard Worker ee1 = _mm256_madd_epi16(e0, coeff_p64_n64);
239*abb65b4bSAndroid Build Coastguard Worker eo0 = _mm256_madd_epi16(e1, coeff_p84_n35);
240*abb65b4bSAndroid Build Coastguard Worker eo1 = _mm256_madd_epi16(e1, coeff_p35_n84);
241*abb65b4bSAndroid Build Coastguard Worker
242*abb65b4bSAndroid Build Coastguard Worker e0 = _mm256_add_epi32(ee0, eo0);
243*abb65b4bSAndroid Build Coastguard Worker e3 = _mm256_sub_epi32(ee0, eo0);
244*abb65b4bSAndroid Build Coastguard Worker e1 = _mm256_add_epi32(ee1, eo1);
245*abb65b4bSAndroid Build Coastguard Worker e2 = _mm256_sub_epi32(ee1, eo1);
246*abb65b4bSAndroid Build Coastguard Worker
247*abb65b4bSAndroid Build Coastguard Worker e0 = _mm256_add_epi32(e0, offset);
248*abb65b4bSAndroid Build Coastguard Worker e3 = _mm256_add_epi32(e3, offset);
249*abb65b4bSAndroid Build Coastguard Worker e1 = _mm256_add_epi32(e1, offset);
250*abb65b4bSAndroid Build Coastguard Worker e2 = _mm256_add_epi32(e2, offset);
251*abb65b4bSAndroid Build Coastguard Worker
252*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_add_epi32(e0, o0);
253*abb65b4bSAndroid Build Coastguard Worker d7 = _mm256_sub_epi32(e0, o0);
254*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_add_epi32(e1, o1);
255*abb65b4bSAndroid Build Coastguard Worker d6 = _mm256_sub_epi32(e1, o1);
256*abb65b4bSAndroid Build Coastguard Worker d2 = _mm256_add_epi32(e2, o2);
257*abb65b4bSAndroid Build Coastguard Worker d5 = _mm256_sub_epi32(e2, o2);
258*abb65b4bSAndroid Build Coastguard Worker d3 = _mm256_add_epi32(e3, o3);
259*abb65b4bSAndroid Build Coastguard Worker d4 = _mm256_sub_epi32(e3, o3);
260*abb65b4bSAndroid Build Coastguard Worker
261*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_srai_epi32(d0, shift);
262*abb65b4bSAndroid Build Coastguard Worker d7 = _mm256_srai_epi32(d7, shift);
263*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_srai_epi32(d1, shift);
264*abb65b4bSAndroid Build Coastguard Worker d6 = _mm256_srai_epi32(d6, shift);
265*abb65b4bSAndroid Build Coastguard Worker d2 = _mm256_srai_epi32(d2, shift);
266*abb65b4bSAndroid Build Coastguard Worker d5 = _mm256_srai_epi32(d5, shift);
267*abb65b4bSAndroid Build Coastguard Worker d3 = _mm256_srai_epi32(d3, shift);
268*abb65b4bSAndroid Build Coastguard Worker d4 = _mm256_srai_epi32(d4, shift);
269*abb65b4bSAndroid Build Coastguard Worker
270*abb65b4bSAndroid Build Coastguard Worker // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit)
271*abb65b4bSAndroid Build Coastguard Worker TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7);
272*abb65b4bSAndroid Build Coastguard Worker d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1);
273*abb65b4bSAndroid Build Coastguard Worker d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1);
274*abb65b4bSAndroid Build Coastguard Worker d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0);
275*abb65b4bSAndroid Build Coastguard Worker d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0);
276*abb65b4bSAndroid Build Coastguard Worker // store line x 8
277*abb65b4bSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*)dst, d0);
278*abb65b4bSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*)(dst + 16), d1);
279*abb65b4bSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*)(dst + 32), d2);
280*abb65b4bSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*)(dst + 48), d3);
281*abb65b4bSAndroid Build Coastguard Worker dst += 64;
282*abb65b4bSAndroid Build Coastguard Worker }
283*abb65b4bSAndroid Build Coastguard Worker }
284*abb65b4bSAndroid Build Coastguard Worker
285*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_itx_part_t oapv_tbl_fn_itx_part_avx[2] =
286*abb65b4bSAndroid Build Coastguard Worker {
287*abb65b4bSAndroid Build Coastguard Worker oapv_itx_part_avx,
288*abb65b4bSAndroid Build Coastguard Worker NULL
289*abb65b4bSAndroid Build Coastguard Worker };
290*abb65b4bSAndroid Build Coastguard Worker
oapv_itx_avx(s16 * src,int shift1,int shift2,int line)291*abb65b4bSAndroid Build Coastguard Worker static void oapv_itx_avx(s16* src, int shift1, int shift2, int line)
292*abb65b4bSAndroid Build Coastguard Worker {
293*abb65b4bSAndroid Build Coastguard Worker // To Do: Merge 2 passes and optimize AVX further
294*abb65b4bSAndroid Build Coastguard Worker ALIGNED_16(s16 dst[OAPV_BLK_D]);
295*abb65b4bSAndroid Build Coastguard Worker oapv_itx_part_avx(src, dst, shift1, line);
296*abb65b4bSAndroid Build Coastguard Worker oapv_itx_part_avx(dst, src, shift2, line);
297*abb65b4bSAndroid Build Coastguard Worker }
298*abb65b4bSAndroid Build Coastguard Worker
299*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_itx_t oapv_tbl_fn_itx_avx[2] =
300*abb65b4bSAndroid Build Coastguard Worker {
301*abb65b4bSAndroid Build Coastguard Worker oapv_itx_avx,
302*abb65b4bSAndroid Build Coastguard Worker NULL
303*abb65b4bSAndroid Build Coastguard Worker };
304*abb65b4bSAndroid Build Coastguard Worker
/*
 * For each of the four signed 32-bit lanes of a and b, computes the full
 * 64-bit product a[k] * b[k] and adds the matching 64-bit lane of
 * offset_vector. Returns the four 64-bit sums.
 */
__m256i mul_128i_to_256i_and_add(__m256i offset_vector, __m128i a, __m128i b)
{
    /* Sign-extend to 64-bit lanes; _mm256_mul_epi32 multiplies the low 32 bits of each. */
    const __m256i product = _mm256_mul_epi32(_mm256_cvtepi32_epi64(a),
                                             _mm256_cvtepi32_epi64(b));

    return _mm256_add_epi64(product, offset_vector);
}
313*abb65b4bSAndroid Build Coastguard Worker
oapv_quant_avx(s16 * coef,u8 qp,int q_matrix[OAPV_BLK_D],int log2_w,int log2_h,int bit_depth,int deadzone_offset)314*abb65b4bSAndroid Build Coastguard Worker static int oapv_quant_avx(s16* coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
315*abb65b4bSAndroid Build Coastguard Worker {
316*abb65b4bSAndroid Build Coastguard Worker s64 offset;
317*abb65b4bSAndroid Build Coastguard Worker int shift;
318*abb65b4bSAndroid Build Coastguard Worker int tr_shift;
319*abb65b4bSAndroid Build Coastguard Worker
320*abb65b4bSAndroid Build Coastguard Worker int log2_size = (log2_w + log2_h) >> 1;
321*abb65b4bSAndroid Build Coastguard Worker tr_shift = MAX_TX_DYNAMIC_RANGE - bit_depth - log2_size;
322*abb65b4bSAndroid Build Coastguard Worker shift = QUANT_SHIFT + tr_shift + (qp / 6);
323*abb65b4bSAndroid Build Coastguard Worker offset = (s64)deadzone_offset << (shift - 9);
324*abb65b4bSAndroid Build Coastguard Worker __m256i offset_vector = _mm256_set1_epi64x(offset);
325*abb65b4bSAndroid Build Coastguard Worker
326*abb65b4bSAndroid Build Coastguard Worker int pixels = (1 << (log2_w + log2_h));
327*abb65b4bSAndroid Build Coastguard Worker int i;
328*abb65b4bSAndroid Build Coastguard Worker __m256i shuffle0 = _mm256_setr_epi32(1, 3, 5, 7, 0, 2, 4, 6);
329*abb65b4bSAndroid Build Coastguard Worker __m256i shuffle1 = _mm256_setr_epi8(
330*abb65b4bSAndroid Build Coastguard Worker 0, 1, 4, 5, 8, 9, 12, 13,
331*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128,
332*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128,
333*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128);
334*abb65b4bSAndroid Build Coastguard Worker __m256i shuffle2 = _mm256_setr_epi8(
335*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128,
336*abb65b4bSAndroid Build Coastguard Worker 0, 1, 4, 5, 8, 9, 12, 13,
337*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128,
338*abb65b4bSAndroid Build Coastguard Worker -128, -128, -128, -128, -128, -128, -128, -128);
339*abb65b4bSAndroid Build Coastguard Worker
340*abb65b4bSAndroid Build Coastguard Worker for (i = 0; i < pixels; i += 8)
341*abb65b4bSAndroid Build Coastguard Worker {
342*abb65b4bSAndroid Build Coastguard Worker // Load first row
343*abb65b4bSAndroid Build Coastguard Worker __m256i quant_matrix = _mm256_lddqu_si256((__m256i*)(q_matrix + i));
344*abb65b4bSAndroid Build Coastguard Worker __m128i coef_row = _mm_lddqu_si128((__m128i*)(coef + i));
345*abb65b4bSAndroid Build Coastguard Worker
346*abb65b4bSAndroid Build Coastguard Worker // Extract sign
347*abb65b4bSAndroid Build Coastguard Worker __m256i coef_row_cast = _mm256_castsi128_si256(coef_row);
348*abb65b4bSAndroid Build Coastguard Worker __m256i sign_mask = _mm256_srai_epi16(coef_row_cast, 15);
349*abb65b4bSAndroid Build Coastguard Worker
350*abb65b4bSAndroid Build Coastguard Worker // Convert to 32 bits and take abs()
351*abb65b4bSAndroid Build Coastguard Worker __m256i coef_row_ext = _mm256_cvtepi16_epi32(coef_row);
352*abb65b4bSAndroid Build Coastguard Worker __m256i coef_row_abs = _mm256_abs_epi32(coef_row_ext);
353*abb65b4bSAndroid Build Coastguard Worker
354*abb65b4bSAndroid Build Coastguard Worker // Multiply coeff with quant values, add offset to result and shift
355*abb65b4bSAndroid Build Coastguard Worker __m256i lev1_low = mul_128i_to_256i_and_add(offset_vector, _mm256_castsi256_si128(coef_row_abs), _mm256_castsi256_si128(quant_matrix));
356*abb65b4bSAndroid Build Coastguard Worker __m256i lev1_high = mul_128i_to_256i_and_add(offset_vector, _mm256_extracti128_si256(coef_row_abs, 1), _mm256_extracti128_si256(quant_matrix, 1));
357*abb65b4bSAndroid Build Coastguard Worker __m256i lev2_low = _mm256_srli_epi64(lev1_low, shift);
358*abb65b4bSAndroid Build Coastguard Worker __m256i lev2_high = _mm256_srli_epi64(lev1_high, shift);
359*abb65b4bSAndroid Build Coastguard Worker
360*abb65b4bSAndroid Build Coastguard Worker // First level of combination
361*abb65b4bSAndroid Build Coastguard Worker lev2_low = _mm256_slli_epi64(lev2_low, 32);
362*abb65b4bSAndroid Build Coastguard Worker __m256i combined = _mm256_or_si256(lev2_low, lev2_high);
363*abb65b4bSAndroid Build Coastguard Worker
364*abb65b4bSAndroid Build Coastguard Worker // Second level of combination
365*abb65b4bSAndroid Build Coastguard Worker __m256i levx = _mm256_permutevar8x32_epi32(combined, shuffle0);
366*abb65b4bSAndroid Build Coastguard Worker __m128i levx_low = _mm256_castsi256_si128(levx);
367*abb65b4bSAndroid Build Coastguard Worker __m256i levx_low_ext = _mm256_castsi128_si256(levx_low);
368*abb65b4bSAndroid Build Coastguard Worker levx_low_ext = _mm256_shuffle_epi8(levx_low_ext, shuffle1);
369*abb65b4bSAndroid Build Coastguard Worker __m128i levx_high = _mm256_extracti128_si256(levx, 1);
370*abb65b4bSAndroid Build Coastguard Worker __m256i levx_high_ext = _mm256_castsi128_si256(levx_high);
371*abb65b4bSAndroid Build Coastguard Worker levx_high_ext = _mm256_shuffle_epi8(levx_high_ext, shuffle2);
372*abb65b4bSAndroid Build Coastguard Worker levx = _mm256_or_si256(levx_high_ext, levx_low_ext);
373*abb65b4bSAndroid Build Coastguard Worker
374*abb65b4bSAndroid Build Coastguard Worker // Apply sign
375*abb65b4bSAndroid Build Coastguard Worker levx = _mm256_sub_epi16(_mm256_xor_si256(levx, sign_mask), sign_mask);
376*abb65b4bSAndroid Build Coastguard Worker
377*abb65b4bSAndroid Build Coastguard Worker // Clip and store in coef
378*abb65b4bSAndroid Build Coastguard Worker __m128i lev4 = _mm256_castsi256_si128(levx);
379*abb65b4bSAndroid Build Coastguard Worker __m128i lev5 = _mm_max_epi16(lev4, _mm_set1_epi16(-32768));
380*abb65b4bSAndroid Build Coastguard Worker __m128i lev6 = _mm_min_epi16(lev5, _mm_set1_epi16(32767));
381*abb65b4bSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*)(coef + i), lev6);
382*abb65b4bSAndroid Build Coastguard Worker }
383*abb65b4bSAndroid Build Coastguard Worker return OAPV_OK;
384*abb65b4bSAndroid Build Coastguard Worker }
385*abb65b4bSAndroid Build Coastguard Worker
386*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_quant_t oapv_tbl_fn_quant_avx[2] =
387*abb65b4bSAndroid Build Coastguard Worker {
388*abb65b4bSAndroid Build Coastguard Worker oapv_quant_avx,
389*abb65b4bSAndroid Build Coastguard Worker NULL
390*abb65b4bSAndroid Build Coastguard Worker };
391*abb65b4bSAndroid Build Coastguard Worker
392*abb65b4bSAndroid Build Coastguard Worker
/*
 * Dequantize a block of coefficients in place (AVX2).
 *
 * Every coefficient is sign-extended to 32 bits and multiplied by the
 * matching q_matrix entry. When shift > 0 the product is rounded by adding
 * (1 << (shift - 1)) and arithmetically shifted right by shift; otherwise it
 * is shifted left by -shift. The 32-bit result is then saturated to the
 * signed 16-bit range and stored back into coef.
 *
 * coef     : in/out buffer of (1 << (log2_w + log2_h)) coefficients
 * q_matrix : per-coefficient scale factors, indexed like coef
 * log2_w   : log2 of the block width
 * log2_h   : log2 of the block height
 * shift    : right-shift amount when positive, negated left-shift amount
 *            when zero or negative
 *
 * Eight coefficients are handled per iteration and no tail handling exists,
 * so the coefficient count must be a multiple of 8.
 * NOTE(review): this is not checked here - confirm against callers.
 */
static void oapv_dquant_avx(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift)
{
    int i;
    int pixels = (1 << (log2_w + log2_h));

    if(shift > 0) {
        s32     offset = (1 << (shift - 1));
        __m256i offset_1 = _mm256_set1_epi32(offset);

        for(i = 0; i < pixels; i += 8) {
            __m256i cur_q_matrix = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(q_matrix + i)));
            __m256i coef_8_val_act = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(coef + i)));

            __m256i lev1 = _mm256_mullo_epi32(coef_8_val_act, cur_q_matrix);
            __m256i lev2 = _mm256_add_epi32(lev1, offset_1);
            __m256i lev3 = _mm256_srai_epi32(lev2, shift);

            /*
             * Narrow with signed saturation. Clipping has to happen while the
             * values are still 32 bits wide: once truncated to 16 bits, a
             * max/min against -32768/32767 can no longer detect overflow.
             * Elements 0..3 live in the low lane and 4..7 in the high lane,
             * so packing (low, high) keeps the original element order.
             */
            __m128i low = _mm256_castsi256_si128(lev3);
            __m128i high = _mm256_extracti128_si256(lev3, 1);
            __m128i clipped = _mm_packs_epi32(low, high);

            _mm_storeu_si128((__m128i *)(coef + i), clipped);
        }
    }
    else {
        int left_shift = -shift;

        for(i = 0; i < pixels; i += 8) {
            __m256i cur_q_matrix = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(q_matrix + i)));
            __m256i coef_8_val_act = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(coef + i)));

            __m256i lev1 = _mm256_mullo_epi32(coef_8_val_act, cur_q_matrix);
            __m256i lev3 = _mm256_slli_epi32(lev1, left_shift);

            /* Saturating 32 -> 16 bit narrowing, element order preserved. */
            __m128i low = _mm256_castsi256_si128(lev3);
            __m128i high = _mm256_extracti128_si256(lev3, 1);
            __m128i clipped = _mm_packs_epi32(low, high);

            _mm_storeu_si128((__m128i *)(coef + i), clipped);
        }
    }
}
449*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_dquant_t oapv_tbl_fn_dquant_avx[2] =
450*abb65b4bSAndroid Build Coastguard Worker {
451*abb65b4bSAndroid Build Coastguard Worker oapv_dquant_avx,
452*abb65b4bSAndroid Build Coastguard Worker NULL,
453*abb65b4bSAndroid Build Coastguard Worker };
454*abb65b4bSAndroid Build Coastguard Worker
oapv_adjust_itrans_avx(int * src,int * dst,int itrans_diff_idx,int diff_step,int shift)455*abb65b4bSAndroid Build Coastguard Worker void oapv_adjust_itrans_avx(int* src, int* dst, int itrans_diff_idx, int diff_step, int shift)
456*abb65b4bSAndroid Build Coastguard Worker {
457*abb65b4bSAndroid Build Coastguard Worker __m256i v0 = _mm256_set1_epi32(diff_step);
458*abb65b4bSAndroid Build Coastguard Worker __m256i v1 = _mm256_set1_epi32(1 << (shift - 1));
459*abb65b4bSAndroid Build Coastguard Worker __m256i s0, s1;
460*abb65b4bSAndroid Build Coastguard Worker
461*abb65b4bSAndroid Build Coastguard Worker for (int j = 0; j < 64; j += 8) {
462*abb65b4bSAndroid Build Coastguard Worker s0 = _mm256_loadu_si256((const __m256i*)(src + j));
463*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_loadu_si256((const __m256i*)(oapv_itrans_diff[itrans_diff_idx] + j));
464*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_mullo_epi32(s1, v0);
465*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_add_epi32(s1, v1);
466*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_srai_epi32(s1, shift);
467*abb65b4bSAndroid Build Coastguard Worker s1 = _mm256_add_epi32(s0, s1);
468*abb65b4bSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*)(dst + j), s1);
469*abb65b4bSAndroid Build Coastguard Worker }
470*abb65b4bSAndroid Build Coastguard Worker }
471*abb65b4bSAndroid Build Coastguard Worker
472*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_itx_adj_t oapv_tbl_fn_itx_adj_avx[2] =
473*abb65b4bSAndroid Build Coastguard Worker {
474*abb65b4bSAndroid Build Coastguard Worker oapv_adjust_itrans_avx,
475*abb65b4bSAndroid Build Coastguard Worker NULL,
476*abb65b4bSAndroid Build Coastguard Worker };