/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
12*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/ppc/transpose_vsx.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/ppc/txfm_common_vsx.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/ppc/types_vsx.h"
17*fb1b10abSAndroid Build Coastguard Worker
18*fb1b10abSAndroid Build Coastguard Worker // Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14.
single_butterfly(int16x8_t a,int16x8_t b,int16x8_t * add,int16x8_t * sub)19*fb1b10abSAndroid Build Coastguard Worker static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
20*fb1b10abSAndroid Build Coastguard Worker int16x8_t *sub) {
21*fb1b10abSAndroid Build Coastguard Worker // Since a + b can overflow 16 bits, the multiplication is distributed
22*fb1b10abSAndroid Build Coastguard Worker // (a * c +/- b * c).
23*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac_e = vec_mule(a, cospi16_v);
24*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac_o = vec_mulo(a, cospi16_v);
25*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc_e = vec_mule(b, cospi16_v);
26*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc_o = vec_mulo(b, cospi16_v);
27*fb1b10abSAndroid Build Coastguard Worker
28*fb1b10abSAndroid Build Coastguard Worker // Reuse the same multiplies for sum and difference.
29*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sum_e = vec_add(ac_e, bc_e);
30*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sum_o = vec_add(ac_o, bc_o);
31*fb1b10abSAndroid Build Coastguard Worker const int32x4_t diff_e = vec_sub(ac_e, bc_e);
32*fb1b10abSAndroid Build Coastguard Worker const int32x4_t diff_o = vec_sub(ac_o, bc_o);
33*fb1b10abSAndroid Build Coastguard Worker
34*fb1b10abSAndroid Build Coastguard Worker // Add rounding offset
35*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
36*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
37*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
38*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
39*fb1b10abSAndroid Build Coastguard Worker
40*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
41*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
42*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
43*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
44*fb1b10abSAndroid Build Coastguard Worker
45*fb1b10abSAndroid Build Coastguard Worker // There's no pack operation for even and odd, so we need to permute.
46*fb1b10abSAndroid Build Coastguard Worker *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
47*fb1b10abSAndroid Build Coastguard Worker *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
48*fb1b10abSAndroid Build Coastguard Worker }
49*fb1b10abSAndroid Build Coastguard Worker
50*fb1b10abSAndroid Build Coastguard Worker // Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14
double_butterfly(int16x8_t a,int16x8_t c1,int16x8_t b,int16x8_t c2,int16x8_t * add,int16x8_t * sub)51*fb1b10abSAndroid Build Coastguard Worker static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
52*fb1b10abSAndroid Build Coastguard Worker int16x8_t c2, int16x8_t *add,
53*fb1b10abSAndroid Build Coastguard Worker int16x8_t *sub) {
54*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac1_o = vec_mulo(a, c1);
55*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac1_e = vec_mule(a, c1);
56*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac2_o = vec_mulo(a, c2);
57*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ac2_e = vec_mule(a, c2);
58*fb1b10abSAndroid Build Coastguard Worker
59*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc1_o = vec_mulo(b, c1);
60*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc1_e = vec_mule(b, c1);
61*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc2_o = vec_mulo(b, c2);
62*fb1b10abSAndroid Build Coastguard Worker const int32x4_t bc2_e = vec_mule(b, c2);
63*fb1b10abSAndroid Build Coastguard Worker
64*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sum_o = vec_add(ac1_o, bc2_o);
65*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sum_e = vec_add(ac1_e, bc2_e);
66*fb1b10abSAndroid Build Coastguard Worker const int32x4_t diff_o = vec_sub(ac2_o, bc1_o);
67*fb1b10abSAndroid Build Coastguard Worker const int32x4_t diff_e = vec_sub(ac2_e, bc1_e);
68*fb1b10abSAndroid Build Coastguard Worker
69*fb1b10abSAndroid Build Coastguard Worker // Add rounding offset
70*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
71*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
72*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
73*fb1b10abSAndroid Build Coastguard Worker const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
74*fb1b10abSAndroid Build Coastguard Worker
75*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
76*fb1b10abSAndroid Build Coastguard Worker const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
77*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
78*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
79*fb1b10abSAndroid Build Coastguard Worker
80*fb1b10abSAndroid Build Coastguard Worker // There's no pack operation for even and odd, so we need to permute.
81*fb1b10abSAndroid Build Coastguard Worker *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
82*fb1b10abSAndroid Build Coastguard Worker *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
83*fb1b10abSAndroid Build Coastguard Worker }
84*fb1b10abSAndroid Build Coastguard Worker
85*fb1b10abSAndroid Build Coastguard Worker // While other architecture combine the load and the stage 1 operations, Power9
86*fb1b10abSAndroid Build Coastguard Worker // benchmarking show no benefit in such an approach.
load(const int16_t * a,int stride,int16x8_t * b)87*fb1b10abSAndroid Build Coastguard Worker static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
88*fb1b10abSAndroid Build Coastguard Worker // Tried out different combinations of load and shift instructions, this is
89*fb1b10abSAndroid Build Coastguard Worker // the fastest one.
90*fb1b10abSAndroid Build Coastguard Worker {
91*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l0 = vec_vsx_ld(0, a);
92*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l1 = vec_vsx_ld(0, a + stride);
93*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride);
94*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride);
95*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride);
96*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride);
97*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride);
98*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride);
99*fb1b10abSAndroid Build Coastguard Worker
100*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride);
101*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride);
102*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride);
103*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride);
104*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride);
105*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride);
106*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride);
107*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride);
108*fb1b10abSAndroid Build Coastguard Worker
109*fb1b10abSAndroid Build Coastguard Worker b[0] = vec_sl(l0, vec_dct_scale_log2);
110*fb1b10abSAndroid Build Coastguard Worker b[1] = vec_sl(l1, vec_dct_scale_log2);
111*fb1b10abSAndroid Build Coastguard Worker b[2] = vec_sl(l2, vec_dct_scale_log2);
112*fb1b10abSAndroid Build Coastguard Worker b[3] = vec_sl(l3, vec_dct_scale_log2);
113*fb1b10abSAndroid Build Coastguard Worker b[4] = vec_sl(l4, vec_dct_scale_log2);
114*fb1b10abSAndroid Build Coastguard Worker b[5] = vec_sl(l5, vec_dct_scale_log2);
115*fb1b10abSAndroid Build Coastguard Worker b[6] = vec_sl(l6, vec_dct_scale_log2);
116*fb1b10abSAndroid Build Coastguard Worker b[7] = vec_sl(l7, vec_dct_scale_log2);
117*fb1b10abSAndroid Build Coastguard Worker
118*fb1b10abSAndroid Build Coastguard Worker b[8] = vec_sl(l8, vec_dct_scale_log2);
119*fb1b10abSAndroid Build Coastguard Worker b[9] = vec_sl(l9, vec_dct_scale_log2);
120*fb1b10abSAndroid Build Coastguard Worker b[10] = vec_sl(l10, vec_dct_scale_log2);
121*fb1b10abSAndroid Build Coastguard Worker b[11] = vec_sl(l11, vec_dct_scale_log2);
122*fb1b10abSAndroid Build Coastguard Worker b[12] = vec_sl(l12, vec_dct_scale_log2);
123*fb1b10abSAndroid Build Coastguard Worker b[13] = vec_sl(l13, vec_dct_scale_log2);
124*fb1b10abSAndroid Build Coastguard Worker b[14] = vec_sl(l14, vec_dct_scale_log2);
125*fb1b10abSAndroid Build Coastguard Worker b[15] = vec_sl(l15, vec_dct_scale_log2);
126*fb1b10abSAndroid Build Coastguard Worker }
127*fb1b10abSAndroid Build Coastguard Worker {
128*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride);
129*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride);
130*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride);
131*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride);
132*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride);
133*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride);
134*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride);
135*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride);
136*fb1b10abSAndroid Build Coastguard Worker
137*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride);
138*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride);
139*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride);
140*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride);
141*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride);
142*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride);
143*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride);
144*fb1b10abSAndroid Build Coastguard Worker const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride);
145*fb1b10abSAndroid Build Coastguard Worker
146*fb1b10abSAndroid Build Coastguard Worker b[16] = vec_sl(l16, vec_dct_scale_log2);
147*fb1b10abSAndroid Build Coastguard Worker b[17] = vec_sl(l17, vec_dct_scale_log2);
148*fb1b10abSAndroid Build Coastguard Worker b[18] = vec_sl(l18, vec_dct_scale_log2);
149*fb1b10abSAndroid Build Coastguard Worker b[19] = vec_sl(l19, vec_dct_scale_log2);
150*fb1b10abSAndroid Build Coastguard Worker b[20] = vec_sl(l20, vec_dct_scale_log2);
151*fb1b10abSAndroid Build Coastguard Worker b[21] = vec_sl(l21, vec_dct_scale_log2);
152*fb1b10abSAndroid Build Coastguard Worker b[22] = vec_sl(l22, vec_dct_scale_log2);
153*fb1b10abSAndroid Build Coastguard Worker b[23] = vec_sl(l23, vec_dct_scale_log2);
154*fb1b10abSAndroid Build Coastguard Worker
155*fb1b10abSAndroid Build Coastguard Worker b[24] = vec_sl(l24, vec_dct_scale_log2);
156*fb1b10abSAndroid Build Coastguard Worker b[25] = vec_sl(l25, vec_dct_scale_log2);
157*fb1b10abSAndroid Build Coastguard Worker b[26] = vec_sl(l26, vec_dct_scale_log2);
158*fb1b10abSAndroid Build Coastguard Worker b[27] = vec_sl(l27, vec_dct_scale_log2);
159*fb1b10abSAndroid Build Coastguard Worker b[28] = vec_sl(l28, vec_dct_scale_log2);
160*fb1b10abSAndroid Build Coastguard Worker b[29] = vec_sl(l29, vec_dct_scale_log2);
161*fb1b10abSAndroid Build Coastguard Worker b[30] = vec_sl(l30, vec_dct_scale_log2);
162*fb1b10abSAndroid Build Coastguard Worker b[31] = vec_sl(l31, vec_dct_scale_log2);
163*fb1b10abSAndroid Build Coastguard Worker }
164*fb1b10abSAndroid Build Coastguard Worker }
165*fb1b10abSAndroid Build Coastguard Worker
store(tran_low_t * a,const int16x8_t * b)166*fb1b10abSAndroid Build Coastguard Worker static INLINE void store(tran_low_t *a, const int16x8_t *b) {
167*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[0], 0, a);
168*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[8], 0, a + 8);
169*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[16], 0, a + 16);
170*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[24], 0, a + 24);
171*fb1b10abSAndroid Build Coastguard Worker
172*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[1], 0, a + 32);
173*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[9], 0, a + 40);
174*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[17], 0, a + 48);
175*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[25], 0, a + 56);
176*fb1b10abSAndroid Build Coastguard Worker
177*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[2], 0, a + 64);
178*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[10], 0, a + 72);
179*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[18], 0, a + 80);
180*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[26], 0, a + 88);
181*fb1b10abSAndroid Build Coastguard Worker
182*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[3], 0, a + 96);
183*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[11], 0, a + 104);
184*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[19], 0, a + 112);
185*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[27], 0, a + 120);
186*fb1b10abSAndroid Build Coastguard Worker
187*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[4], 0, a + 128);
188*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[12], 0, a + 136);
189*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[20], 0, a + 144);
190*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[28], 0, a + 152);
191*fb1b10abSAndroid Build Coastguard Worker
192*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[5], 0, a + 160);
193*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[13], 0, a + 168);
194*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[21], 0, a + 176);
195*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[29], 0, a + 184);
196*fb1b10abSAndroid Build Coastguard Worker
197*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[6], 0, a + 192);
198*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[14], 0, a + 200);
199*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[22], 0, a + 208);
200*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[30], 0, a + 216);
201*fb1b10abSAndroid Build Coastguard Worker
202*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[7], 0, a + 224);
203*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[15], 0, a + 232);
204*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[23], 0, a + 240);
205*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(b[31], 0, a + 248);
206*fb1b10abSAndroid Build Coastguard Worker }
207*fb1b10abSAndroid Build Coastguard Worker
208*fb1b10abSAndroid Build Coastguard Worker // Returns 1 if negative 0 if positive
vec_sign_s16(int16x8_t a)209*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t vec_sign_s16(int16x8_t a) {
210*fb1b10abSAndroid Build Coastguard Worker return vec_sr(a, vec_shift_sign_s16);
211*fb1b10abSAndroid Build Coastguard Worker }
212*fb1b10abSAndroid Build Coastguard Worker
213*fb1b10abSAndroid Build Coastguard Worker // Add 2 if positive, 1 if negative, and shift by 2.
sub_round_shift(const int16x8_t a)214*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
215*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sign = vec_sign_s16(a);
216*fb1b10abSAndroid Build Coastguard Worker return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
217*fb1b10abSAndroid Build Coastguard Worker }
218*fb1b10abSAndroid Build Coastguard Worker
219*fb1b10abSAndroid Build Coastguard Worker // Add 1 if positive, 2 if negative, and shift by 2.
220*fb1b10abSAndroid Build Coastguard Worker // In practice, add 1, then add the sign bit, then shift without rounding.
add_round_shift_s16(const int16x8_t a)221*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
222*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sign = vec_sign_s16(a);
223*fb1b10abSAndroid Build Coastguard Worker return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
224*fb1b10abSAndroid Build Coastguard Worker }
225*fb1b10abSAndroid Build Coastguard Worker
fdct32_vsx(const int16x8_t * in,int16x8_t * out,int pass)226*fb1b10abSAndroid Build Coastguard Worker static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
227*fb1b10abSAndroid Build Coastguard Worker int16x8_t temp0[32]; // Hold stages: 1, 4, 7
228*fb1b10abSAndroid Build Coastguard Worker int16x8_t temp1[32]; // Hold stages: 2, 5
229*fb1b10abSAndroid Build Coastguard Worker int16x8_t temp2[32]; // Hold stages: 3, 6
230*fb1b10abSAndroid Build Coastguard Worker int i;
231*fb1b10abSAndroid Build Coastguard Worker
232*fb1b10abSAndroid Build Coastguard Worker // Stage 1
233*fb1b10abSAndroid Build Coastguard Worker // Unrolling this loops actually slows down Power9 benchmarks
234*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < 16; i++) {
235*fb1b10abSAndroid Build Coastguard Worker temp0[i] = vec_add(in[i], in[31 - i]);
236*fb1b10abSAndroid Build Coastguard Worker // pass through to stage 3.
237*fb1b10abSAndroid Build Coastguard Worker temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
238*fb1b10abSAndroid Build Coastguard Worker }
239*fb1b10abSAndroid Build Coastguard Worker
240*fb1b10abSAndroid Build Coastguard Worker // Stage 2
241*fb1b10abSAndroid Build Coastguard Worker // Unrolling this loops actually slows down Power9 benchmarks
242*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < 8; i++) {
243*fb1b10abSAndroid Build Coastguard Worker temp1[i] = vec_add(temp0[i], temp0[15 - i]);
244*fb1b10abSAndroid Build Coastguard Worker temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
245*fb1b10abSAndroid Build Coastguard Worker }
246*fb1b10abSAndroid Build Coastguard Worker
247*fb1b10abSAndroid Build Coastguard Worker // Apply butterflies (in place) on pass through to stage 3.
248*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
249*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
250*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
251*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);
252*fb1b10abSAndroid Build Coastguard Worker
253*fb1b10abSAndroid Build Coastguard Worker // dump the magnitude by 4, hence the intermediate values are within
254*fb1b10abSAndroid Build Coastguard Worker // the range of 16 bits.
255*fb1b10abSAndroid Build Coastguard Worker if (pass) {
256*fb1b10abSAndroid Build Coastguard Worker temp1[0] = add_round_shift_s16(temp1[0]);
257*fb1b10abSAndroid Build Coastguard Worker temp1[1] = add_round_shift_s16(temp1[1]);
258*fb1b10abSAndroid Build Coastguard Worker temp1[2] = add_round_shift_s16(temp1[2]);
259*fb1b10abSAndroid Build Coastguard Worker temp1[3] = add_round_shift_s16(temp1[3]);
260*fb1b10abSAndroid Build Coastguard Worker temp1[4] = add_round_shift_s16(temp1[4]);
261*fb1b10abSAndroid Build Coastguard Worker temp1[5] = add_round_shift_s16(temp1[5]);
262*fb1b10abSAndroid Build Coastguard Worker temp1[6] = add_round_shift_s16(temp1[6]);
263*fb1b10abSAndroid Build Coastguard Worker temp1[7] = add_round_shift_s16(temp1[7]);
264*fb1b10abSAndroid Build Coastguard Worker temp1[8] = add_round_shift_s16(temp1[8]);
265*fb1b10abSAndroid Build Coastguard Worker temp1[9] = add_round_shift_s16(temp1[9]);
266*fb1b10abSAndroid Build Coastguard Worker temp1[10] = add_round_shift_s16(temp1[10]);
267*fb1b10abSAndroid Build Coastguard Worker temp1[11] = add_round_shift_s16(temp1[11]);
268*fb1b10abSAndroid Build Coastguard Worker temp1[12] = add_round_shift_s16(temp1[12]);
269*fb1b10abSAndroid Build Coastguard Worker temp1[13] = add_round_shift_s16(temp1[13]);
270*fb1b10abSAndroid Build Coastguard Worker temp1[14] = add_round_shift_s16(temp1[14]);
271*fb1b10abSAndroid Build Coastguard Worker temp1[15] = add_round_shift_s16(temp1[15]);
272*fb1b10abSAndroid Build Coastguard Worker
273*fb1b10abSAndroid Build Coastguard Worker temp1[16] = add_round_shift_s16(temp1[16]);
274*fb1b10abSAndroid Build Coastguard Worker temp1[17] = add_round_shift_s16(temp1[17]);
275*fb1b10abSAndroid Build Coastguard Worker temp1[18] = add_round_shift_s16(temp1[18]);
276*fb1b10abSAndroid Build Coastguard Worker temp1[19] = add_round_shift_s16(temp1[19]);
277*fb1b10abSAndroid Build Coastguard Worker temp1[20] = add_round_shift_s16(temp1[20]);
278*fb1b10abSAndroid Build Coastguard Worker temp1[21] = add_round_shift_s16(temp1[21]);
279*fb1b10abSAndroid Build Coastguard Worker temp1[22] = add_round_shift_s16(temp1[22]);
280*fb1b10abSAndroid Build Coastguard Worker temp1[23] = add_round_shift_s16(temp1[23]);
281*fb1b10abSAndroid Build Coastguard Worker temp1[24] = add_round_shift_s16(temp1[24]);
282*fb1b10abSAndroid Build Coastguard Worker temp1[25] = add_round_shift_s16(temp1[25]);
283*fb1b10abSAndroid Build Coastguard Worker temp1[26] = add_round_shift_s16(temp1[26]);
284*fb1b10abSAndroid Build Coastguard Worker temp1[27] = add_round_shift_s16(temp1[27]);
285*fb1b10abSAndroid Build Coastguard Worker temp1[28] = add_round_shift_s16(temp1[28]);
286*fb1b10abSAndroid Build Coastguard Worker temp1[29] = add_round_shift_s16(temp1[29]);
287*fb1b10abSAndroid Build Coastguard Worker temp1[30] = add_round_shift_s16(temp1[30]);
288*fb1b10abSAndroid Build Coastguard Worker temp1[31] = add_round_shift_s16(temp1[31]);
289*fb1b10abSAndroid Build Coastguard Worker }
290*fb1b10abSAndroid Build Coastguard Worker
291*fb1b10abSAndroid Build Coastguard Worker // Stage 3
292*fb1b10abSAndroid Build Coastguard Worker temp2[0] = vec_add(temp1[0], temp1[7]);
293*fb1b10abSAndroid Build Coastguard Worker temp2[1] = vec_add(temp1[1], temp1[6]);
294*fb1b10abSAndroid Build Coastguard Worker temp2[2] = vec_add(temp1[2], temp1[5]);
295*fb1b10abSAndroid Build Coastguard Worker temp2[3] = vec_add(temp1[3], temp1[4]);
296*fb1b10abSAndroid Build Coastguard Worker temp2[5] = vec_sub(temp1[2], temp1[5]);
297*fb1b10abSAndroid Build Coastguard Worker temp2[6] = vec_sub(temp1[1], temp1[6]);
298*fb1b10abSAndroid Build Coastguard Worker temp2[8] = temp1[8];
299*fb1b10abSAndroid Build Coastguard Worker temp2[9] = temp1[9];
300*fb1b10abSAndroid Build Coastguard Worker
301*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]);
302*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]);
303*fb1b10abSAndroid Build Coastguard Worker temp2[14] = temp1[14];
304*fb1b10abSAndroid Build Coastguard Worker temp2[15] = temp1[15];
305*fb1b10abSAndroid Build Coastguard Worker
306*fb1b10abSAndroid Build Coastguard Worker temp2[18] = vec_add(temp1[18], temp1[21]);
307*fb1b10abSAndroid Build Coastguard Worker temp2[19] = vec_add(temp1[19], temp1[20]);
308*fb1b10abSAndroid Build Coastguard Worker
309*fb1b10abSAndroid Build Coastguard Worker temp2[20] = vec_sub(temp1[19], temp1[20]);
310*fb1b10abSAndroid Build Coastguard Worker temp2[21] = vec_sub(temp1[18], temp1[21]);
311*fb1b10abSAndroid Build Coastguard Worker
312*fb1b10abSAndroid Build Coastguard Worker temp2[26] = vec_sub(temp1[29], temp1[26]);
313*fb1b10abSAndroid Build Coastguard Worker temp2[27] = vec_sub(temp1[28], temp1[27]);
314*fb1b10abSAndroid Build Coastguard Worker
315*fb1b10abSAndroid Build Coastguard Worker temp2[28] = vec_add(temp1[28], temp1[27]);
316*fb1b10abSAndroid Build Coastguard Worker temp2[29] = vec_add(temp1[29], temp1[26]);
317*fb1b10abSAndroid Build Coastguard Worker
318*fb1b10abSAndroid Build Coastguard Worker // Pass through Stage 4
319*fb1b10abSAndroid Build Coastguard Worker temp0[7] = vec_sub(temp1[0], temp1[7]);
320*fb1b10abSAndroid Build Coastguard Worker temp0[4] = vec_sub(temp1[3], temp1[4]);
321*fb1b10abSAndroid Build Coastguard Worker temp0[16] = vec_add(temp1[16], temp1[23]);
322*fb1b10abSAndroid Build Coastguard Worker temp0[17] = vec_add(temp1[17], temp1[22]);
323*fb1b10abSAndroid Build Coastguard Worker temp0[22] = vec_sub(temp1[17], temp1[22]);
324*fb1b10abSAndroid Build Coastguard Worker temp0[23] = vec_sub(temp1[16], temp1[23]);
325*fb1b10abSAndroid Build Coastguard Worker temp0[24] = vec_sub(temp1[31], temp1[24]);
326*fb1b10abSAndroid Build Coastguard Worker temp0[25] = vec_sub(temp1[30], temp1[25]);
327*fb1b10abSAndroid Build Coastguard Worker temp0[30] = vec_add(temp1[30], temp1[25]);
328*fb1b10abSAndroid Build Coastguard Worker temp0[31] = vec_add(temp1[31], temp1[24]);
329*fb1b10abSAndroid Build Coastguard Worker
330*fb1b10abSAndroid Build Coastguard Worker // Stage 4
331*fb1b10abSAndroid Build Coastguard Worker temp0[0] = vec_add(temp2[0], temp2[3]);
332*fb1b10abSAndroid Build Coastguard Worker temp0[1] = vec_add(temp2[1], temp2[2]);
333*fb1b10abSAndroid Build Coastguard Worker temp0[2] = vec_sub(temp2[1], temp2[2]);
334*fb1b10abSAndroid Build Coastguard Worker temp0[3] = vec_sub(temp2[0], temp2[3]);
335*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]);
336*fb1b10abSAndroid Build Coastguard Worker
337*fb1b10abSAndroid Build Coastguard Worker temp0[9] = vec_add(temp2[9], temp2[10]);
338*fb1b10abSAndroid Build Coastguard Worker temp0[10] = vec_sub(temp2[9], temp2[10]);
339*fb1b10abSAndroid Build Coastguard Worker temp0[13] = vec_sub(temp2[14], temp2[13]);
340*fb1b10abSAndroid Build Coastguard Worker temp0[14] = vec_add(temp2[14], temp2[13]);
341*fb1b10abSAndroid Build Coastguard Worker
342*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29],
343*fb1b10abSAndroid Build Coastguard Worker &temp0[18]);
344*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28],
345*fb1b10abSAndroid Build Coastguard Worker &temp0[19]);
346*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27],
347*fb1b10abSAndroid Build Coastguard Worker &temp0[20]);
348*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26],
349*fb1b10abSAndroid Build Coastguard Worker &temp0[21]);
350*fb1b10abSAndroid Build Coastguard Worker
351*fb1b10abSAndroid Build Coastguard Worker // Pass through Stage 5
352*fb1b10abSAndroid Build Coastguard Worker temp1[8] = vec_add(temp2[8], temp2[11]);
353*fb1b10abSAndroid Build Coastguard Worker temp1[11] = vec_sub(temp2[8], temp2[11]);
354*fb1b10abSAndroid Build Coastguard Worker temp1[12] = vec_sub(temp2[15], temp2[12]);
355*fb1b10abSAndroid Build Coastguard Worker temp1[15] = vec_add(temp2[15], temp2[12]);
356*fb1b10abSAndroid Build Coastguard Worker
357*fb1b10abSAndroid Build Coastguard Worker // Stage 5
358*fb1b10abSAndroid Build Coastguard Worker // 0 and 1 pass through to 0 and 16 at the end
359*fb1b10abSAndroid Build Coastguard Worker single_butterfly(temp0[0], temp0[1], &out[0], &out[16]);
360*fb1b10abSAndroid Build Coastguard Worker
361*fb1b10abSAndroid Build Coastguard Worker // 2 and 3 pass through to 8 and 24 at the end
362*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]);
363*fb1b10abSAndroid Build Coastguard Worker
364*fb1b10abSAndroid Build Coastguard Worker temp1[4] = vec_add(temp0[4], temp0[5]);
365*fb1b10abSAndroid Build Coastguard Worker temp1[5] = vec_sub(temp0[4], temp0[5]);
366*fb1b10abSAndroid Build Coastguard Worker temp1[6] = vec_sub(temp0[7], temp0[6]);
367*fb1b10abSAndroid Build Coastguard Worker temp1[7] = vec_add(temp0[7], temp0[6]);
368*fb1b10abSAndroid Build Coastguard Worker
369*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14],
370*fb1b10abSAndroid Build Coastguard Worker &temp1[9]);
371*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13],
372*fb1b10abSAndroid Build Coastguard Worker &temp1[10]);
373*fb1b10abSAndroid Build Coastguard Worker
374*fb1b10abSAndroid Build Coastguard Worker temp1[17] = vec_add(temp0[17], temp0[18]);
375*fb1b10abSAndroid Build Coastguard Worker temp1[18] = vec_sub(temp0[17], temp0[18]);
376*fb1b10abSAndroid Build Coastguard Worker
377*fb1b10abSAndroid Build Coastguard Worker temp1[21] = vec_sub(temp0[22], temp0[21]);
378*fb1b10abSAndroid Build Coastguard Worker temp1[22] = vec_add(temp0[22], temp0[21]);
379*fb1b10abSAndroid Build Coastguard Worker
380*fb1b10abSAndroid Build Coastguard Worker temp1[25] = vec_add(temp0[25], temp0[26]);
381*fb1b10abSAndroid Build Coastguard Worker temp1[26] = vec_sub(temp0[25], temp0[26]);
382*fb1b10abSAndroid Build Coastguard Worker
383*fb1b10abSAndroid Build Coastguard Worker temp1[29] = vec_sub(temp0[30], temp0[29]);
384*fb1b10abSAndroid Build Coastguard Worker temp1[30] = vec_add(temp0[30], temp0[29]);
385*fb1b10abSAndroid Build Coastguard Worker
386*fb1b10abSAndroid Build Coastguard Worker // Pass through Stage 6
387*fb1b10abSAndroid Build Coastguard Worker temp2[16] = vec_add(temp0[16], temp0[19]);
388*fb1b10abSAndroid Build Coastguard Worker temp2[19] = vec_sub(temp0[16], temp0[19]);
389*fb1b10abSAndroid Build Coastguard Worker temp2[20] = vec_sub(temp0[23], temp0[20]);
390*fb1b10abSAndroid Build Coastguard Worker temp2[23] = vec_add(temp0[23], temp0[20]);
391*fb1b10abSAndroid Build Coastguard Worker temp2[24] = vec_add(temp0[24], temp0[27]);
392*fb1b10abSAndroid Build Coastguard Worker temp2[27] = vec_sub(temp0[24], temp0[27]);
393*fb1b10abSAndroid Build Coastguard Worker temp2[28] = vec_sub(temp0[31], temp0[28]);
394*fb1b10abSAndroid Build Coastguard Worker temp2[31] = vec_add(temp0[31], temp0[28]);
395*fb1b10abSAndroid Build Coastguard Worker
396*fb1b10abSAndroid Build Coastguard Worker // Stage 6
397*fb1b10abSAndroid Build Coastguard Worker // 4 and 7 pass through to 4 and 28 at the end
398*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]);
399*fb1b10abSAndroid Build Coastguard Worker // 5 and 6 pass through to 20 and 12 at the end
400*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20],
401*fb1b10abSAndroid Build Coastguard Worker &out[12]);
402*fb1b10abSAndroid Build Coastguard Worker temp2[8] = vec_add(temp1[8], temp1[9]);
403*fb1b10abSAndroid Build Coastguard Worker temp2[9] = vec_sub(temp1[8], temp1[9]);
404*fb1b10abSAndroid Build Coastguard Worker temp2[10] = vec_sub(temp1[11], temp1[10]);
405*fb1b10abSAndroid Build Coastguard Worker temp2[11] = vec_add(temp1[11], temp1[10]);
406*fb1b10abSAndroid Build Coastguard Worker temp2[12] = vec_add(temp1[12], temp1[13]);
407*fb1b10abSAndroid Build Coastguard Worker temp2[13] = vec_sub(temp1[12], temp1[13]);
408*fb1b10abSAndroid Build Coastguard Worker temp2[14] = vec_sub(temp1[15], temp1[14]);
409*fb1b10abSAndroid Build Coastguard Worker temp2[15] = vec_add(temp1[15], temp1[14]);
410*fb1b10abSAndroid Build Coastguard Worker
411*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30],
412*fb1b10abSAndroid Build Coastguard Worker &temp2[17]);
413*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29],
414*fb1b10abSAndroid Build Coastguard Worker &temp2[18]);
415*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26],
416*fb1b10abSAndroid Build Coastguard Worker &temp2[21]);
417*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25],
418*fb1b10abSAndroid Build Coastguard Worker &temp2[22]);
419*fb1b10abSAndroid Build Coastguard Worker
420*fb1b10abSAndroid Build Coastguard Worker // Stage 7
421*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]);
422*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18],
423*fb1b10abSAndroid Build Coastguard Worker &out[14]);
424*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10],
425*fb1b10abSAndroid Build Coastguard Worker &out[22]);
426*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26],
427*fb1b10abSAndroid Build Coastguard Worker &out[6]);
428*fb1b10abSAndroid Build Coastguard Worker
429*fb1b10abSAndroid Build Coastguard Worker temp0[16] = vec_add(temp2[16], temp2[17]);
430*fb1b10abSAndroid Build Coastguard Worker temp0[17] = vec_sub(temp2[16], temp2[17]);
431*fb1b10abSAndroid Build Coastguard Worker temp0[18] = vec_sub(temp2[19], temp2[18]);
432*fb1b10abSAndroid Build Coastguard Worker temp0[19] = vec_add(temp2[19], temp2[18]);
433*fb1b10abSAndroid Build Coastguard Worker temp0[20] = vec_add(temp2[20], temp2[21]);
434*fb1b10abSAndroid Build Coastguard Worker temp0[21] = vec_sub(temp2[20], temp2[21]);
435*fb1b10abSAndroid Build Coastguard Worker temp0[22] = vec_sub(temp2[23], temp2[22]);
436*fb1b10abSAndroid Build Coastguard Worker temp0[23] = vec_add(temp2[23], temp2[22]);
437*fb1b10abSAndroid Build Coastguard Worker temp0[24] = vec_add(temp2[24], temp2[25]);
438*fb1b10abSAndroid Build Coastguard Worker temp0[25] = vec_sub(temp2[24], temp2[25]);
439*fb1b10abSAndroid Build Coastguard Worker temp0[26] = vec_sub(temp2[27], temp2[26]);
440*fb1b10abSAndroid Build Coastguard Worker temp0[27] = vec_add(temp2[27], temp2[26]);
441*fb1b10abSAndroid Build Coastguard Worker temp0[28] = vec_add(temp2[28], temp2[29]);
442*fb1b10abSAndroid Build Coastguard Worker temp0[29] = vec_sub(temp2[28], temp2[29]);
443*fb1b10abSAndroid Build Coastguard Worker temp0[30] = vec_sub(temp2[31], temp2[30]);
444*fb1b10abSAndroid Build Coastguard Worker temp0[31] = vec_add(temp2[31], temp2[30]);
445*fb1b10abSAndroid Build Coastguard Worker
446*fb1b10abSAndroid Build Coastguard Worker // Final stage --- outputs indices are bit-reversed.
447*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1],
448*fb1b10abSAndroid Build Coastguard Worker &out[31]);
449*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17],
450*fb1b10abSAndroid Build Coastguard Worker &out[15]);
451*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9],
452*fb1b10abSAndroid Build Coastguard Worker &out[23]);
453*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25],
454*fb1b10abSAndroid Build Coastguard Worker &out[7]);
455*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5],
456*fb1b10abSAndroid Build Coastguard Worker &out[27]);
457*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21],
458*fb1b10abSAndroid Build Coastguard Worker &out[11]);
459*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13],
460*fb1b10abSAndroid Build Coastguard Worker &out[19]);
461*fb1b10abSAndroid Build Coastguard Worker double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29],
462*fb1b10abSAndroid Build Coastguard Worker &out[3]);
463*fb1b10abSAndroid Build Coastguard Worker
464*fb1b10abSAndroid Build Coastguard Worker if (pass == 0) {
465*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < 32; i++) {
466*fb1b10abSAndroid Build Coastguard Worker out[i] = sub_round_shift(out[i]);
467*fb1b10abSAndroid Build Coastguard Worker }
468*fb1b10abSAndroid Build Coastguard Worker }
469*fb1b10abSAndroid Build Coastguard Worker }
470*fb1b10abSAndroid Build Coastguard Worker
// Two-pass 32x32 forward transform built on fdct32_vsx().
//
//   input:  source samples; consecutive rows are `stride` elements apart
//           (as interpreted by load()).
//   out:    destination coefficients, written by store() in four chunks that
//           start 8 * 32 elements apart.
//   stride: row pitch of `input`, forwarded unchanged to load().
//
// Pass 0 runs on four 8-lane-wide column strips taken at input offsets 0, 8,
// 16 and 24. Pass 1 then runs once per band of eight pass-0 outputs, with the
// data transposed in 8x8 tiles before and after the transform.
void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) {
  int16x8_t work[32];        // Scratch input for both passes.
  int16x8_t pass0[4][32];    // Pass-0 output, one entry per column strip.
  int16x8_t pass1[32];       // Pass-1 output before the final transpose.
  int16x8_t transposed[32];  // Pass-1 output laid out for store().
  int strip;
  int band;
  int tile;

  // First pass: transform each 8x32 column strip independently.
  for (strip = 0; strip < 4; strip++) {
    load(input + 8 * strip, stride, work);
    fdct32_vsx(work, pass0[strip], 0);
  }

  // Second pass: one 8x32 row band per iteration, top to bottom.
  for (band = 0; band < 4; band++) {
    // Gather this band's 8x8 tile from every strip, transposing each tile so
    // the four tiles together form the 32-entry input of the next transform.
    for (strip = 0; strip < 4; strip++) {
      transpose_8x8(&pass0[strip][8 * band], &work[8 * strip]);
    }

    fdct32_vsx(work, pass1, 1);

    // Transpose each 8x8 tile back before writing the band out.
    for (tile = 0; tile < 4; tile++) {
      transpose_8x8(&pass1[8 * tile], &transposed[8 * tile]);
    }

    store(out + 8 * 32 * band, transposed);
  }
}
554