1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar *
3*c83a76b0SSuyog Pawar * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*c83a76b0SSuyog Pawar *
5*c83a76b0SSuyog Pawar * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar *
9*c83a76b0SSuyog Pawar * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar *
11*c83a76b0SSuyog Pawar * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar * limitations under the License.
16*c83a76b0SSuyog Pawar *
17*c83a76b0SSuyog Pawar ******************************************************************************/
18*c83a76b0SSuyog Pawar /**
19*c83a76b0SSuyog Pawar *******************************************************************************
20*c83a76b0SSuyog Pawar * @file
21*c83a76b0SSuyog Pawar * ihevc_itrans_recon_32x32.c
22*c83a76b0SSuyog Pawar *
23*c83a76b0SSuyog Pawar * @brief
24*c83a76b0SSuyog Pawar * Contains function definitions for inverse transform and reconstruction of 32x32 blocks
25*c83a76b0SSuyog Pawar *
26*c83a76b0SSuyog Pawar *
27*c83a76b0SSuyog Pawar * @author
28*c83a76b0SSuyog Pawar * 100470
29*c83a76b0SSuyog Pawar *
30*c83a76b0SSuyog Pawar * @par List of Functions:
31*c83a76b0SSuyog Pawar * - ihevc_itrans_recon_32x32()
32*c83a76b0SSuyog Pawar *
33*c83a76b0SSuyog Pawar * @remarks
34*c83a76b0SSuyog Pawar * None
35*c83a76b0SSuyog Pawar *
36*c83a76b0SSuyog Pawar *******************************************************************************
37*c83a76b0SSuyog Pawar */
38*c83a76b0SSuyog Pawar #include <stdio.h>
39*c83a76b0SSuyog Pawar #include <string.h>
40*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
41*c83a76b0SSuyog Pawar #include "ihevc_macros.h"
42*c83a76b0SSuyog Pawar #include "ihevc_platform_macros.h"
43*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
44*c83a76b0SSuyog Pawar #include "ihevc_trans_tables.h"
45*c83a76b0SSuyog Pawar #include "ihevc_itrans_recon.h"
46*c83a76b0SSuyog Pawar #include "ihevc_func_selector.h"
47*c83a76b0SSuyog Pawar #include "ihevc_trans_macros.h"
48*c83a76b0SSuyog Pawar
49*c83a76b0SSuyog Pawar
50*c83a76b0SSuyog Pawar /**
51*c83a76b0SSuyog Pawar *******************************************************************************
52*c83a76b0SSuyog Pawar *
53*c83a76b0SSuyog Pawar * @brief
54*c83a76b0SSuyog Pawar * This function performs Inverse transform and reconstruction for 32x32
55*c83a76b0SSuyog Pawar * input block
56*c83a76b0SSuyog Pawar *
57*c83a76b0SSuyog Pawar * @par Description:
58*c83a76b0SSuyog Pawar * Performs inverse transform and adds the prediction data and clips output
59*c83a76b0SSuyog Pawar * to 8 bit
60*c83a76b0SSuyog Pawar *
61*c83a76b0SSuyog Pawar * @param[in] pi2_src
62*c83a76b0SSuyog Pawar * Input 32x32 coefficients
63*c83a76b0SSuyog Pawar *
64*c83a76b0SSuyog Pawar * @param[in] pi2_tmp
65*c83a76b0SSuyog Pawar * Temporary 32x32 buffer that holds the output of the
66*c83a76b0SSuyog Pawar * 1st stage of the inverse transform. It is written by
67*c83a76b0SSuyog Pawar * the 1st stage and read back as the input of the
68*c83a76b0SSuyog Pawar * 2nd stage.
69*c83a76b0SSuyog Pawar *
70*c83a76b0SSuyog Pawar * @param[in] pu1_pred
71*c83a76b0SSuyog Pawar * Prediction 32x32 block
72*c83a76b0SSuyog Pawar *
73*c83a76b0SSuyog Pawar * @param[out] pu1_dst
74*c83a76b0SSuyog Pawar * Output 32x32 block
75*c83a76b0SSuyog Pawar *
76*c83a76b0SSuyog Pawar * @param[in] src_strd
77*c83a76b0SSuyog Pawar * Input stride
78*c83a76b0SSuyog Pawar *
79*c83a76b0SSuyog Pawar * @param[in] pred_strd
80*c83a76b0SSuyog Pawar * Prediction stride
81*c83a76b0SSuyog Pawar *
82*c83a76b0SSuyog Pawar * @param[in] dst_strd
83*c83a76b0SSuyog Pawar * Output Stride
84*c83a76b0SSuyog Pawar *
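85*c83a76b0SSuyog Pawar * @param[in] zero_cols
86*c83a76b0SSuyog Pawar * Bitmask of zero columns in pi2_src (bit j set means column j is all zero)
87*c83a76b0SSuyog Pawar *
88*c83a76b0SSuyog Pawar * @param[in] zero_rows
89*c83a76b0SSuyog Pawar * Bitmask of zero rows in pi2_src (bit i set means row i is all zero)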
90*c83a76b0SSuyog Pawar *
91*c83a76b0SSuyog Pawar * @returns Void
92*c83a76b0SSuyog Pawar *
93*c83a76b0SSuyog Pawar * @remarks
94*c83a76b0SSuyog Pawar * None
95*c83a76b0SSuyog Pawar *
96*c83a76b0SSuyog Pawar *******************************************************************************
97*c83a76b0SSuyog Pawar */
98*c83a76b0SSuyog Pawar
ihevc_itrans_recon_32x32(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)99*c83a76b0SSuyog Pawar void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
100*c83a76b0SSuyog Pawar WORD16 *pi2_tmp,
101*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
102*c83a76b0SSuyog Pawar UWORD8 *pu1_dst,
103*c83a76b0SSuyog Pawar WORD32 src_strd,
104*c83a76b0SSuyog Pawar WORD32 pred_strd,
105*c83a76b0SSuyog Pawar WORD32 dst_strd,
106*c83a76b0SSuyog Pawar WORD32 zero_cols,
107*c83a76b0SSuyog Pawar WORD32 zero_rows)
108*c83a76b0SSuyog Pawar {
109*c83a76b0SSuyog Pawar WORD32 j, k;
110*c83a76b0SSuyog Pawar WORD32 e[16], o[16];
111*c83a76b0SSuyog Pawar WORD32 ee[8], eo[8];
112*c83a76b0SSuyog Pawar WORD32 eee[4], eeo[4];
113*c83a76b0SSuyog Pawar WORD32 eeee[2], eeeo[2];
114*c83a76b0SSuyog Pawar WORD32 add;
115*c83a76b0SSuyog Pawar WORD32 shift;
116*c83a76b0SSuyog Pawar WORD16 *pi2_tmp_orig;
117*c83a76b0SSuyog Pawar WORD32 trans_size;
118*c83a76b0SSuyog Pawar WORD32 zero_rows_2nd_stage = zero_cols;
119*c83a76b0SSuyog Pawar WORD32 row_limit_2nd_stage;
120*c83a76b0SSuyog Pawar
121*c83a76b0SSuyog Pawar trans_size = TRANS_SIZE_32;
122*c83a76b0SSuyog Pawar pi2_tmp_orig = pi2_tmp;
123*c83a76b0SSuyog Pawar
124*c83a76b0SSuyog Pawar if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
125*c83a76b0SSuyog Pawar row_limit_2nd_stage = 4;
126*c83a76b0SSuyog Pawar else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
127*c83a76b0SSuyog Pawar row_limit_2nd_stage = 8;
128*c83a76b0SSuyog Pawar else
129*c83a76b0SSuyog Pawar row_limit_2nd_stage = TRANS_SIZE_32;
130*c83a76b0SSuyog Pawar
131*c83a76b0SSuyog Pawar if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */
132*c83a76b0SSuyog Pawar {
133*c83a76b0SSuyog Pawar /************************************************************************************************/
134*c83a76b0SSuyog Pawar /**********************************START - IT_RECON_32x32****************************************/
135*c83a76b0SSuyog Pawar /************************************************************************************************/
136*c83a76b0SSuyog Pawar /* Inverse Transform 1st stage */
137*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_1;
138*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
139*c83a76b0SSuyog Pawar
140*c83a76b0SSuyog Pawar for(j = 0; j < row_limit_2nd_stage; j++)
141*c83a76b0SSuyog Pawar {
142*c83a76b0SSuyog Pawar /* Checking for Zero Cols */
143*c83a76b0SSuyog Pawar if((zero_cols & 1) == 1)
144*c83a76b0SSuyog Pawar {
145*c83a76b0SSuyog Pawar memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
146*c83a76b0SSuyog Pawar }
147*c83a76b0SSuyog Pawar else
148*c83a76b0SSuyog Pawar {
149*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
150*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
151*c83a76b0SSuyog Pawar {
152*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
153*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
154*c83a76b0SSuyog Pawar * pi2_src[3 * src_strd];
155*c83a76b0SSuyog Pawar }
156*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
157*c83a76b0SSuyog Pawar {
158*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
159*c83a76b0SSuyog Pawar }
160*c83a76b0SSuyog Pawar // for(k = 0; k < 4; k++)
161*c83a76b0SSuyog Pawar {
162*c83a76b0SSuyog Pawar eeo[0] = 0;
163*c83a76b0SSuyog Pawar eeo[1] = 0;
164*c83a76b0SSuyog Pawar eeo[2] = 0;
165*c83a76b0SSuyog Pawar eeo[3] = 0;
166*c83a76b0SSuyog Pawar }
167*c83a76b0SSuyog Pawar eeeo[0] = 0;
168*c83a76b0SSuyog Pawar eeeo[1] = 0;
169*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
170*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
171*c83a76b0SSuyog Pawar
172*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
173*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
174*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
175*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
176*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
177*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
178*c83a76b0SSuyog Pawar {
179*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
180*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
181*c83a76b0SSuyog Pawar }
182*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
183*c83a76b0SSuyog Pawar {
184*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
185*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
186*c83a76b0SSuyog Pawar }
187*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
188*c83a76b0SSuyog Pawar {
189*c83a76b0SSuyog Pawar pi2_tmp[k] =
190*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
191*c83a76b0SSuyog Pawar pi2_tmp[k + 16] =
192*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
193*c83a76b0SSuyog Pawar }
194*c83a76b0SSuyog Pawar }
195*c83a76b0SSuyog Pawar pi2_src++;
196*c83a76b0SSuyog Pawar pi2_tmp += trans_size;
197*c83a76b0SSuyog Pawar zero_cols = zero_cols >> 1;
198*c83a76b0SSuyog Pawar }
199*c83a76b0SSuyog Pawar
200*c83a76b0SSuyog Pawar pi2_tmp = pi2_tmp_orig;
201*c83a76b0SSuyog Pawar
202*c83a76b0SSuyog Pawar /* Inverse Transform 2nd stage */
203*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_2;
204*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
205*c83a76b0SSuyog Pawar if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
206*c83a76b0SSuyog Pawar {
207*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
208*c83a76b0SSuyog Pawar {
209*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
210*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
211*c83a76b0SSuyog Pawar {
212*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
213*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
214*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size];
215*c83a76b0SSuyog Pawar }
216*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
217*c83a76b0SSuyog Pawar {
218*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
219*c83a76b0SSuyog Pawar }
220*c83a76b0SSuyog Pawar // for(k = 0; k < 4; k++)
221*c83a76b0SSuyog Pawar {
222*c83a76b0SSuyog Pawar eeo[0] = 0;
223*c83a76b0SSuyog Pawar eeo[1] = 0;
224*c83a76b0SSuyog Pawar eeo[2] = 0;
225*c83a76b0SSuyog Pawar eeo[3] = 0;
226*c83a76b0SSuyog Pawar }
227*c83a76b0SSuyog Pawar eeeo[0] = 0;
228*c83a76b0SSuyog Pawar eeeo[1] = 0;
229*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
230*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
231*c83a76b0SSuyog Pawar
232*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
233*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
234*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
235*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
236*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
237*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
238*c83a76b0SSuyog Pawar {
239*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
240*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
241*c83a76b0SSuyog Pawar }
242*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
243*c83a76b0SSuyog Pawar {
244*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
245*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
246*c83a76b0SSuyog Pawar }
247*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
248*c83a76b0SSuyog Pawar {
249*c83a76b0SSuyog Pawar WORD32 itrans_out;
250*c83a76b0SSuyog Pawar itrans_out =
251*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
252*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
253*c83a76b0SSuyog Pawar itrans_out =
254*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
255*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
256*c83a76b0SSuyog Pawar }
257*c83a76b0SSuyog Pawar pi2_tmp++;
258*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
259*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
260*c83a76b0SSuyog Pawar }
261*c83a76b0SSuyog Pawar }
262*c83a76b0SSuyog Pawar else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
263*c83a76b0SSuyog Pawar {
264*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
265*c83a76b0SSuyog Pawar {
266*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
267*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
268*c83a76b0SSuyog Pawar {
269*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
270*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
271*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
272*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
273*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
274*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
275*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size];
276*c83a76b0SSuyog Pawar }
277*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
278*c83a76b0SSuyog Pawar {
279*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
280*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
281*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size];
282*c83a76b0SSuyog Pawar }
283*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
284*c83a76b0SSuyog Pawar {
285*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
286*c83a76b0SSuyog Pawar }
287*c83a76b0SSuyog Pawar eeeo[0] = 0;
288*c83a76b0SSuyog Pawar eeeo[1] = 0;
289*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
290*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
291*c83a76b0SSuyog Pawar
292*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
293*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
294*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
295*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
296*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
297*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
298*c83a76b0SSuyog Pawar {
299*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
300*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
301*c83a76b0SSuyog Pawar }
302*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
303*c83a76b0SSuyog Pawar {
304*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
305*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
306*c83a76b0SSuyog Pawar }
307*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
308*c83a76b0SSuyog Pawar {
309*c83a76b0SSuyog Pawar WORD32 itrans_out;
310*c83a76b0SSuyog Pawar itrans_out =
311*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
312*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
313*c83a76b0SSuyog Pawar itrans_out =
314*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
315*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
316*c83a76b0SSuyog Pawar }
317*c83a76b0SSuyog Pawar pi2_tmp++;
318*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
319*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
320*c83a76b0SSuyog Pawar }
321*c83a76b0SSuyog Pawar }
322*c83a76b0SSuyog Pawar else /* All rows of output of 1st stage are non-zero */
323*c83a76b0SSuyog Pawar {
324*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
325*c83a76b0SSuyog Pawar {
326*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
327*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
328*c83a76b0SSuyog Pawar {
329*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
330*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
331*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
332*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
333*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
334*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
335*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size]
336*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[9][k]
337*c83a76b0SSuyog Pawar * pi2_tmp[9 * trans_size]
338*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[11][k]
339*c83a76b0SSuyog Pawar * pi2_tmp[11 * trans_size]
340*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[13][k]
341*c83a76b0SSuyog Pawar * pi2_tmp[13 * trans_size]
342*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[15][k]
343*c83a76b0SSuyog Pawar * pi2_tmp[15 * trans_size]
344*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[17][k]
345*c83a76b0SSuyog Pawar * pi2_tmp[17 * trans_size]
346*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[19][k]
347*c83a76b0SSuyog Pawar * pi2_tmp[19 * trans_size]
348*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[21][k]
349*c83a76b0SSuyog Pawar * pi2_tmp[21 * trans_size]
350*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[23][k]
351*c83a76b0SSuyog Pawar * pi2_tmp[23 * trans_size]
352*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[25][k]
353*c83a76b0SSuyog Pawar * pi2_tmp[25 * trans_size]
354*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[27][k]
355*c83a76b0SSuyog Pawar * pi2_tmp[27 * trans_size]
356*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[29][k]
357*c83a76b0SSuyog Pawar * pi2_tmp[29 * trans_size]
358*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[31][k]
359*c83a76b0SSuyog Pawar * pi2_tmp[31 * trans_size];
360*c83a76b0SSuyog Pawar }
361*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
362*c83a76b0SSuyog Pawar {
363*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
364*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
365*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size]
366*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[10][k]
367*c83a76b0SSuyog Pawar * pi2_tmp[10 * trans_size]
368*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[14][k]
369*c83a76b0SSuyog Pawar * pi2_tmp[14 * trans_size]
370*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[18][k]
371*c83a76b0SSuyog Pawar * pi2_tmp[18 * trans_size]
372*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[22][k]
373*c83a76b0SSuyog Pawar * pi2_tmp[22 * trans_size]
374*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[26][k]
375*c83a76b0SSuyog Pawar * pi2_tmp[26 * trans_size]
376*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[30][k]
377*c83a76b0SSuyog Pawar * pi2_tmp[30 * trans_size];
378*c83a76b0SSuyog Pawar }
379*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
380*c83a76b0SSuyog Pawar {
381*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
382*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[12][k]
383*c83a76b0SSuyog Pawar * pi2_tmp[12 * trans_size]
384*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[20][k]
385*c83a76b0SSuyog Pawar * pi2_tmp[20 * trans_size]
386*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[28][k]
387*c83a76b0SSuyog Pawar * pi2_tmp[28 * trans_size];
388*c83a76b0SSuyog Pawar }
389*c83a76b0SSuyog Pawar eeeo[0] =
390*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
391*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][0]
392*c83a76b0SSuyog Pawar * pi2_tmp[24
393*c83a76b0SSuyog Pawar * trans_size];
394*c83a76b0SSuyog Pawar eeeo[1] =
395*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
396*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][1]
397*c83a76b0SSuyog Pawar * pi2_tmp[24
398*c83a76b0SSuyog Pawar * trans_size];
399*c83a76b0SSuyog Pawar eeee[0] =
400*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
401*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][0]
402*c83a76b0SSuyog Pawar * pi2_tmp[16
403*c83a76b0SSuyog Pawar * trans_size];
404*c83a76b0SSuyog Pawar eeee[1] =
405*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
406*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][1]
407*c83a76b0SSuyog Pawar * pi2_tmp[16
408*c83a76b0SSuyog Pawar * trans_size];
409*c83a76b0SSuyog Pawar
410*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
412*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
413*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
414*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
415*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
416*c83a76b0SSuyog Pawar {
417*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
418*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
419*c83a76b0SSuyog Pawar }
420*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
421*c83a76b0SSuyog Pawar {
422*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
423*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
424*c83a76b0SSuyog Pawar }
425*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
426*c83a76b0SSuyog Pawar {
427*c83a76b0SSuyog Pawar WORD32 itrans_out;
428*c83a76b0SSuyog Pawar itrans_out =
429*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
430*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
431*c83a76b0SSuyog Pawar itrans_out =
432*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
433*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
434*c83a76b0SSuyog Pawar }
435*c83a76b0SSuyog Pawar pi2_tmp++;
436*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
437*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
438*c83a76b0SSuyog Pawar }
439*c83a76b0SSuyog Pawar }
440*c83a76b0SSuyog Pawar /************************************************************************************************/
441*c83a76b0SSuyog Pawar /************************************END - IT_RECON_32x32****************************************/
442*c83a76b0SSuyog Pawar /************************************************************************************************/
443*c83a76b0SSuyog Pawar }
444*c83a76b0SSuyog Pawar else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
445*c83a76b0SSuyog Pawar {
446*c83a76b0SSuyog Pawar /************************************************************************************************/
447*c83a76b0SSuyog Pawar /**********************************START - IT_RECON_32x32****************************************/
448*c83a76b0SSuyog Pawar /************************************************************************************************/
449*c83a76b0SSuyog Pawar /* Inverse Transform 1st stage */
450*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_1;
451*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
452*c83a76b0SSuyog Pawar
453*c83a76b0SSuyog Pawar for(j = 0; j < row_limit_2nd_stage; j++)
454*c83a76b0SSuyog Pawar {
455*c83a76b0SSuyog Pawar /* Checking for Zero Cols */
456*c83a76b0SSuyog Pawar if((zero_cols & 1) == 1)
457*c83a76b0SSuyog Pawar {
458*c83a76b0SSuyog Pawar memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
459*c83a76b0SSuyog Pawar }
460*c83a76b0SSuyog Pawar else
461*c83a76b0SSuyog Pawar {
462*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
463*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
464*c83a76b0SSuyog Pawar {
465*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
466*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
467*c83a76b0SSuyog Pawar * pi2_src[3 * src_strd]
468*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
469*c83a76b0SSuyog Pawar * pi2_src[5 * src_strd]
470*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
471*c83a76b0SSuyog Pawar * pi2_src[7 * src_strd];
472*c83a76b0SSuyog Pawar }
473*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
474*c83a76b0SSuyog Pawar {
475*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
476*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
477*c83a76b0SSuyog Pawar * pi2_src[6 * src_strd];
478*c83a76b0SSuyog Pawar }
479*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
480*c83a76b0SSuyog Pawar {
481*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
482*c83a76b0SSuyog Pawar }
483*c83a76b0SSuyog Pawar eeeo[0] = 0;
484*c83a76b0SSuyog Pawar eeeo[1] = 0;
485*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
486*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
487*c83a76b0SSuyog Pawar
488*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
489*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
490*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
491*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
492*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
493*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
494*c83a76b0SSuyog Pawar {
495*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
496*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
497*c83a76b0SSuyog Pawar }
498*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
499*c83a76b0SSuyog Pawar {
500*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
501*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
502*c83a76b0SSuyog Pawar }
503*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
504*c83a76b0SSuyog Pawar {
505*c83a76b0SSuyog Pawar pi2_tmp[k] =
506*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
507*c83a76b0SSuyog Pawar pi2_tmp[k + 16] =
508*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
509*c83a76b0SSuyog Pawar }
510*c83a76b0SSuyog Pawar }
511*c83a76b0SSuyog Pawar pi2_src++;
512*c83a76b0SSuyog Pawar pi2_tmp += trans_size;
513*c83a76b0SSuyog Pawar zero_cols = zero_cols >> 1;
514*c83a76b0SSuyog Pawar }
515*c83a76b0SSuyog Pawar
516*c83a76b0SSuyog Pawar pi2_tmp = pi2_tmp_orig;
517*c83a76b0SSuyog Pawar
518*c83a76b0SSuyog Pawar /* Inverse Transform 2nd stage */
519*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_2;
520*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
521*c83a76b0SSuyog Pawar if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
522*c83a76b0SSuyog Pawar {
523*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
524*c83a76b0SSuyog Pawar {
525*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
526*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
527*c83a76b0SSuyog Pawar {
528*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
529*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
530*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size];
531*c83a76b0SSuyog Pawar }
532*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
533*c83a76b0SSuyog Pawar {
534*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
535*c83a76b0SSuyog Pawar }
536*c83a76b0SSuyog Pawar // for(k = 0; k < 4; k++)
537*c83a76b0SSuyog Pawar {
538*c83a76b0SSuyog Pawar eeo[0] = 0;
539*c83a76b0SSuyog Pawar eeo[1] = 0;
540*c83a76b0SSuyog Pawar eeo[2] = 0;
541*c83a76b0SSuyog Pawar eeo[3] = 0;
542*c83a76b0SSuyog Pawar }
543*c83a76b0SSuyog Pawar eeeo[0] = 0;
544*c83a76b0SSuyog Pawar eeeo[1] = 0;
545*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
546*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
547*c83a76b0SSuyog Pawar
548*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
549*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
550*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
551*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
552*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
553*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
554*c83a76b0SSuyog Pawar {
555*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
556*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
557*c83a76b0SSuyog Pawar }
558*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
559*c83a76b0SSuyog Pawar {
560*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
561*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
562*c83a76b0SSuyog Pawar }
563*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
564*c83a76b0SSuyog Pawar {
565*c83a76b0SSuyog Pawar WORD32 itrans_out;
566*c83a76b0SSuyog Pawar itrans_out =
567*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
568*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
569*c83a76b0SSuyog Pawar itrans_out =
570*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
571*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
572*c83a76b0SSuyog Pawar }
573*c83a76b0SSuyog Pawar pi2_tmp++;
574*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
575*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
576*c83a76b0SSuyog Pawar }
577*c83a76b0SSuyog Pawar }
578*c83a76b0SSuyog Pawar else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
579*c83a76b0SSuyog Pawar {
580*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
581*c83a76b0SSuyog Pawar {
582*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
583*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
584*c83a76b0SSuyog Pawar {
585*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
586*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
587*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
588*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
589*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
590*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
591*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size];
592*c83a76b0SSuyog Pawar }
593*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
594*c83a76b0SSuyog Pawar {
595*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
596*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
597*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size];
598*c83a76b0SSuyog Pawar }
599*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
600*c83a76b0SSuyog Pawar {
601*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
602*c83a76b0SSuyog Pawar }
603*c83a76b0SSuyog Pawar eeeo[0] = 0;
604*c83a76b0SSuyog Pawar eeeo[1] = 0;
605*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
606*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
607*c83a76b0SSuyog Pawar
608*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
609*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
610*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
611*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
612*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
613*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
614*c83a76b0SSuyog Pawar {
615*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
616*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
617*c83a76b0SSuyog Pawar }
618*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
619*c83a76b0SSuyog Pawar {
620*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
621*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
622*c83a76b0SSuyog Pawar }
623*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
624*c83a76b0SSuyog Pawar {
625*c83a76b0SSuyog Pawar WORD32 itrans_out;
626*c83a76b0SSuyog Pawar itrans_out =
627*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
628*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
629*c83a76b0SSuyog Pawar itrans_out =
630*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
631*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
632*c83a76b0SSuyog Pawar }
633*c83a76b0SSuyog Pawar pi2_tmp++;
634*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
635*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
636*c83a76b0SSuyog Pawar }
637*c83a76b0SSuyog Pawar }
638*c83a76b0SSuyog Pawar else /* All rows of output of 1st stage are non-zero */
639*c83a76b0SSuyog Pawar {
640*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
641*c83a76b0SSuyog Pawar {
642*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
643*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
644*c83a76b0SSuyog Pawar {
645*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
646*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
647*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
648*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
649*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
650*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
651*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size]
652*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[9][k]
653*c83a76b0SSuyog Pawar * pi2_tmp[9 * trans_size]
654*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[11][k]
655*c83a76b0SSuyog Pawar * pi2_tmp[11 * trans_size]
656*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[13][k]
657*c83a76b0SSuyog Pawar * pi2_tmp[13 * trans_size]
658*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[15][k]
659*c83a76b0SSuyog Pawar * pi2_tmp[15 * trans_size]
660*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[17][k]
661*c83a76b0SSuyog Pawar * pi2_tmp[17 * trans_size]
662*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[19][k]
663*c83a76b0SSuyog Pawar * pi2_tmp[19 * trans_size]
664*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[21][k]
665*c83a76b0SSuyog Pawar * pi2_tmp[21 * trans_size]
666*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[23][k]
667*c83a76b0SSuyog Pawar * pi2_tmp[23 * trans_size]
668*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[25][k]
669*c83a76b0SSuyog Pawar * pi2_tmp[25 * trans_size]
670*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[27][k]
671*c83a76b0SSuyog Pawar * pi2_tmp[27 * trans_size]
672*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[29][k]
673*c83a76b0SSuyog Pawar * pi2_tmp[29 * trans_size]
674*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[31][k]
675*c83a76b0SSuyog Pawar * pi2_tmp[31 * trans_size];
676*c83a76b0SSuyog Pawar }
677*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
678*c83a76b0SSuyog Pawar {
679*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
680*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
681*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size]
682*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[10][k]
683*c83a76b0SSuyog Pawar * pi2_tmp[10 * trans_size]
684*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[14][k]
685*c83a76b0SSuyog Pawar * pi2_tmp[14 * trans_size]
686*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[18][k]
687*c83a76b0SSuyog Pawar * pi2_tmp[18 * trans_size]
688*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[22][k]
689*c83a76b0SSuyog Pawar * pi2_tmp[22 * trans_size]
690*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[26][k]
691*c83a76b0SSuyog Pawar * pi2_tmp[26 * trans_size]
692*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[30][k]
693*c83a76b0SSuyog Pawar * pi2_tmp[30 * trans_size];
694*c83a76b0SSuyog Pawar }
695*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
696*c83a76b0SSuyog Pawar {
697*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
698*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[12][k]
699*c83a76b0SSuyog Pawar * pi2_tmp[12 * trans_size]
700*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[20][k]
701*c83a76b0SSuyog Pawar * pi2_tmp[20 * trans_size]
702*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[28][k]
703*c83a76b0SSuyog Pawar * pi2_tmp[28 * trans_size];
704*c83a76b0SSuyog Pawar }
705*c83a76b0SSuyog Pawar eeeo[0] =
706*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
707*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][0]
708*c83a76b0SSuyog Pawar * pi2_tmp[24
709*c83a76b0SSuyog Pawar * trans_size];
710*c83a76b0SSuyog Pawar eeeo[1] =
711*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
712*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][1]
713*c83a76b0SSuyog Pawar * pi2_tmp[24
714*c83a76b0SSuyog Pawar * trans_size];
715*c83a76b0SSuyog Pawar eeee[0] =
716*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
717*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][0]
718*c83a76b0SSuyog Pawar * pi2_tmp[16
719*c83a76b0SSuyog Pawar * trans_size];
720*c83a76b0SSuyog Pawar eeee[1] =
721*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
722*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][1]
723*c83a76b0SSuyog Pawar * pi2_tmp[16
724*c83a76b0SSuyog Pawar * trans_size];
725*c83a76b0SSuyog Pawar
726*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
727*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
728*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
729*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
730*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
731*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
732*c83a76b0SSuyog Pawar {
733*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
734*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
735*c83a76b0SSuyog Pawar }
736*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
737*c83a76b0SSuyog Pawar {
738*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
739*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
740*c83a76b0SSuyog Pawar }
741*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
742*c83a76b0SSuyog Pawar {
743*c83a76b0SSuyog Pawar WORD32 itrans_out;
744*c83a76b0SSuyog Pawar itrans_out =
745*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
746*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
747*c83a76b0SSuyog Pawar itrans_out =
748*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
749*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
750*c83a76b0SSuyog Pawar }
751*c83a76b0SSuyog Pawar pi2_tmp++;
752*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
753*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
754*c83a76b0SSuyog Pawar }
755*c83a76b0SSuyog Pawar }
756*c83a76b0SSuyog Pawar /************************************************************************************************/
757*c83a76b0SSuyog Pawar /************************************END - IT_RECON_32x32****************************************/
758*c83a76b0SSuyog Pawar /************************************************************************************************/
759*c83a76b0SSuyog Pawar }
760*c83a76b0SSuyog Pawar else /* All rows of input are non-zero */
761*c83a76b0SSuyog Pawar {
762*c83a76b0SSuyog Pawar /************************************************************************************************/
763*c83a76b0SSuyog Pawar /**********************************START - IT_RECON_32x32****************************************/
764*c83a76b0SSuyog Pawar /************************************************************************************************/
765*c83a76b0SSuyog Pawar /* Inverse Transform 1st stage */
766*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_1;
767*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
768*c83a76b0SSuyog Pawar
769*c83a76b0SSuyog Pawar for(j = 0; j < row_limit_2nd_stage; j++)
770*c83a76b0SSuyog Pawar {
771*c83a76b0SSuyog Pawar /* Checking for Zero Cols */
772*c83a76b0SSuyog Pawar if((zero_cols & 1) == 1)
773*c83a76b0SSuyog Pawar {
774*c83a76b0SSuyog Pawar memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
775*c83a76b0SSuyog Pawar }
776*c83a76b0SSuyog Pawar else
777*c83a76b0SSuyog Pawar {
778*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
779*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
780*c83a76b0SSuyog Pawar {
781*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
782*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
783*c83a76b0SSuyog Pawar * pi2_src[3 * src_strd]
784*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
785*c83a76b0SSuyog Pawar * pi2_src[5 * src_strd]
786*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
787*c83a76b0SSuyog Pawar * pi2_src[7 * src_strd]
788*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[9][k]
789*c83a76b0SSuyog Pawar * pi2_src[9 * src_strd]
790*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[11][k]
791*c83a76b0SSuyog Pawar * pi2_src[11 * src_strd]
792*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[13][k]
793*c83a76b0SSuyog Pawar * pi2_src[13 * src_strd]
794*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[15][k]
795*c83a76b0SSuyog Pawar * pi2_src[15 * src_strd]
796*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[17][k]
797*c83a76b0SSuyog Pawar * pi2_src[17 * src_strd]
798*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[19][k]
799*c83a76b0SSuyog Pawar * pi2_src[19 * src_strd]
800*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[21][k]
801*c83a76b0SSuyog Pawar * pi2_src[21 * src_strd]
802*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[23][k]
803*c83a76b0SSuyog Pawar * pi2_src[23 * src_strd]
804*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[25][k]
805*c83a76b0SSuyog Pawar * pi2_src[25 * src_strd]
806*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[27][k]
807*c83a76b0SSuyog Pawar * pi2_src[27 * src_strd]
808*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[29][k]
809*c83a76b0SSuyog Pawar * pi2_src[29 * src_strd]
810*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[31][k]
811*c83a76b0SSuyog Pawar * pi2_src[31 * src_strd];
812*c83a76b0SSuyog Pawar }
813*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
814*c83a76b0SSuyog Pawar {
815*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
816*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
817*c83a76b0SSuyog Pawar * pi2_src[6 * src_strd]
818*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[10][k]
819*c83a76b0SSuyog Pawar * pi2_src[10 * src_strd]
820*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[14][k]
821*c83a76b0SSuyog Pawar * pi2_src[14 * src_strd]
822*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[18][k]
823*c83a76b0SSuyog Pawar * pi2_src[18 * src_strd]
824*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[22][k]
825*c83a76b0SSuyog Pawar * pi2_src[22 * src_strd]
826*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[26][k]
827*c83a76b0SSuyog Pawar * pi2_src[26 * src_strd]
828*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[30][k]
829*c83a76b0SSuyog Pawar * pi2_src[30 * src_strd];
830*c83a76b0SSuyog Pawar }
831*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
832*c83a76b0SSuyog Pawar {
833*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
834*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[12][k]
835*c83a76b0SSuyog Pawar * pi2_src[12 * src_strd]
836*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[20][k]
837*c83a76b0SSuyog Pawar * pi2_src[20 * src_strd]
838*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[28][k]
839*c83a76b0SSuyog Pawar * pi2_src[28 * src_strd];
840*c83a76b0SSuyog Pawar }
841*c83a76b0SSuyog Pawar eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
842*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][0]
843*c83a76b0SSuyog Pawar * pi2_src[24 * src_strd];
844*c83a76b0SSuyog Pawar eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
845*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][1]
846*c83a76b0SSuyog Pawar * pi2_src[24 * src_strd];
847*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
848*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][0]
849*c83a76b0SSuyog Pawar * pi2_src[16 * src_strd];
850*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
851*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][1]
852*c83a76b0SSuyog Pawar * pi2_src[16 * src_strd];
853*c83a76b0SSuyog Pawar
854*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
855*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
856*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
857*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
858*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
859*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
860*c83a76b0SSuyog Pawar {
861*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
862*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
863*c83a76b0SSuyog Pawar }
864*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
865*c83a76b0SSuyog Pawar {
866*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
867*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
868*c83a76b0SSuyog Pawar }
869*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
870*c83a76b0SSuyog Pawar {
871*c83a76b0SSuyog Pawar pi2_tmp[k] =
872*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
873*c83a76b0SSuyog Pawar pi2_tmp[k + 16] =
874*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
875*c83a76b0SSuyog Pawar }
876*c83a76b0SSuyog Pawar }
877*c83a76b0SSuyog Pawar pi2_src++;
878*c83a76b0SSuyog Pawar pi2_tmp += trans_size;
879*c83a76b0SSuyog Pawar zero_cols = zero_cols >> 1;
880*c83a76b0SSuyog Pawar }
881*c83a76b0SSuyog Pawar
882*c83a76b0SSuyog Pawar pi2_tmp = pi2_tmp_orig;
883*c83a76b0SSuyog Pawar
884*c83a76b0SSuyog Pawar /* Inverse Transform 2nd stage */
885*c83a76b0SSuyog Pawar shift = IT_SHIFT_STAGE_2;
886*c83a76b0SSuyog Pawar add = 1 << (shift - 1);
887*c83a76b0SSuyog Pawar if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
888*c83a76b0SSuyog Pawar {
889*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
890*c83a76b0SSuyog Pawar {
891*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
892*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
893*c83a76b0SSuyog Pawar {
894*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
895*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
896*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size];
897*c83a76b0SSuyog Pawar }
898*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
899*c83a76b0SSuyog Pawar {
900*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
901*c83a76b0SSuyog Pawar }
902*c83a76b0SSuyog Pawar // for(k = 0; k < 4; k++)
903*c83a76b0SSuyog Pawar {
904*c83a76b0SSuyog Pawar eeo[0] = 0;
905*c83a76b0SSuyog Pawar eeo[1] = 0;
906*c83a76b0SSuyog Pawar eeo[2] = 0;
907*c83a76b0SSuyog Pawar eeo[3] = 0;
908*c83a76b0SSuyog Pawar }
909*c83a76b0SSuyog Pawar eeeo[0] = 0;
910*c83a76b0SSuyog Pawar eeeo[1] = 0;
911*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
912*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
913*c83a76b0SSuyog Pawar
914*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
915*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
916*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
917*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
918*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
919*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
920*c83a76b0SSuyog Pawar {
921*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
922*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
923*c83a76b0SSuyog Pawar }
924*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
925*c83a76b0SSuyog Pawar {
926*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
927*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
928*c83a76b0SSuyog Pawar }
929*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
930*c83a76b0SSuyog Pawar {
931*c83a76b0SSuyog Pawar WORD32 itrans_out;
932*c83a76b0SSuyog Pawar itrans_out =
933*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
934*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
935*c83a76b0SSuyog Pawar itrans_out =
936*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
937*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
938*c83a76b0SSuyog Pawar }
939*c83a76b0SSuyog Pawar pi2_tmp++;
940*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
941*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
942*c83a76b0SSuyog Pawar }
943*c83a76b0SSuyog Pawar }
944*c83a76b0SSuyog Pawar else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
945*c83a76b0SSuyog Pawar {
946*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
947*c83a76b0SSuyog Pawar {
948*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
949*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
950*c83a76b0SSuyog Pawar {
951*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
952*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
953*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
954*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
955*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
956*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
957*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size];
958*c83a76b0SSuyog Pawar }
959*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
960*c83a76b0SSuyog Pawar {
961*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
962*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
963*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size];
964*c83a76b0SSuyog Pawar }
965*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
966*c83a76b0SSuyog Pawar {
967*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
968*c83a76b0SSuyog Pawar }
969*c83a76b0SSuyog Pawar eeeo[0] = 0;
970*c83a76b0SSuyog Pawar eeeo[1] = 0;
971*c83a76b0SSuyog Pawar eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
972*c83a76b0SSuyog Pawar eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
973*c83a76b0SSuyog Pawar
974*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
975*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
976*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
977*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
978*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
979*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
980*c83a76b0SSuyog Pawar {
981*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
982*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
983*c83a76b0SSuyog Pawar }
984*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
985*c83a76b0SSuyog Pawar {
986*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
987*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
988*c83a76b0SSuyog Pawar }
989*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
990*c83a76b0SSuyog Pawar {
991*c83a76b0SSuyog Pawar WORD32 itrans_out;
992*c83a76b0SSuyog Pawar itrans_out =
993*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
994*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
995*c83a76b0SSuyog Pawar itrans_out =
996*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
997*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
998*c83a76b0SSuyog Pawar }
999*c83a76b0SSuyog Pawar pi2_tmp++;
1000*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
1001*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
1002*c83a76b0SSuyog Pawar }
1003*c83a76b0SSuyog Pawar }
1004*c83a76b0SSuyog Pawar else /* All rows of output of 1st stage are non-zero */
1005*c83a76b0SSuyog Pawar {
1006*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1007*c83a76b0SSuyog Pawar {
1008*c83a76b0SSuyog Pawar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1009*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
1010*c83a76b0SSuyog Pawar {
1011*c83a76b0SSuyog Pawar o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
1012*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[3][k]
1013*c83a76b0SSuyog Pawar * pi2_tmp[3 * trans_size]
1014*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[5][k]
1015*c83a76b0SSuyog Pawar * pi2_tmp[5 * trans_size]
1016*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[7][k]
1017*c83a76b0SSuyog Pawar * pi2_tmp[7 * trans_size]
1018*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[9][k]
1019*c83a76b0SSuyog Pawar * pi2_tmp[9 * trans_size]
1020*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[11][k]
1021*c83a76b0SSuyog Pawar * pi2_tmp[11 * trans_size]
1022*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[13][k]
1023*c83a76b0SSuyog Pawar * pi2_tmp[13 * trans_size]
1024*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[15][k]
1025*c83a76b0SSuyog Pawar * pi2_tmp[15 * trans_size]
1026*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[17][k]
1027*c83a76b0SSuyog Pawar * pi2_tmp[17 * trans_size]
1028*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[19][k]
1029*c83a76b0SSuyog Pawar * pi2_tmp[19 * trans_size]
1030*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[21][k]
1031*c83a76b0SSuyog Pawar * pi2_tmp[21 * trans_size]
1032*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[23][k]
1033*c83a76b0SSuyog Pawar * pi2_tmp[23 * trans_size]
1034*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[25][k]
1035*c83a76b0SSuyog Pawar * pi2_tmp[25 * trans_size]
1036*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[27][k]
1037*c83a76b0SSuyog Pawar * pi2_tmp[27 * trans_size]
1038*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[29][k]
1039*c83a76b0SSuyog Pawar * pi2_tmp[29 * trans_size]
1040*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[31][k]
1041*c83a76b0SSuyog Pawar * pi2_tmp[31 * trans_size];
1042*c83a76b0SSuyog Pawar }
1043*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
1044*c83a76b0SSuyog Pawar {
1045*c83a76b0SSuyog Pawar eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
1046*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[6][k]
1047*c83a76b0SSuyog Pawar * pi2_tmp[6 * trans_size]
1048*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[10][k]
1049*c83a76b0SSuyog Pawar * pi2_tmp[10 * trans_size]
1050*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[14][k]
1051*c83a76b0SSuyog Pawar * pi2_tmp[14 * trans_size]
1052*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[18][k]
1053*c83a76b0SSuyog Pawar * pi2_tmp[18 * trans_size]
1054*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[22][k]
1055*c83a76b0SSuyog Pawar * pi2_tmp[22 * trans_size]
1056*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[26][k]
1057*c83a76b0SSuyog Pawar * pi2_tmp[26 * trans_size]
1058*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[30][k]
1059*c83a76b0SSuyog Pawar * pi2_tmp[30 * trans_size];
1060*c83a76b0SSuyog Pawar }
1061*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
1062*c83a76b0SSuyog Pawar {
1063*c83a76b0SSuyog Pawar eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
1064*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[12][k]
1065*c83a76b0SSuyog Pawar * pi2_tmp[12 * trans_size]
1066*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[20][k]
1067*c83a76b0SSuyog Pawar * pi2_tmp[20 * trans_size]
1068*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[28][k]
1069*c83a76b0SSuyog Pawar * pi2_tmp[28 * trans_size];
1070*c83a76b0SSuyog Pawar }
1071*c83a76b0SSuyog Pawar eeeo[0] =
1072*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
1073*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][0]
1074*c83a76b0SSuyog Pawar * pi2_tmp[24
1075*c83a76b0SSuyog Pawar * trans_size];
1076*c83a76b0SSuyog Pawar eeeo[1] =
1077*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
1078*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[24][1]
1079*c83a76b0SSuyog Pawar * pi2_tmp[24
1080*c83a76b0SSuyog Pawar * trans_size];
1081*c83a76b0SSuyog Pawar eeee[0] =
1082*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
1083*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][0]
1084*c83a76b0SSuyog Pawar * pi2_tmp[16
1085*c83a76b0SSuyog Pawar * trans_size];
1086*c83a76b0SSuyog Pawar eeee[1] =
1087*c83a76b0SSuyog Pawar g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
1088*c83a76b0SSuyog Pawar + g_ai2_ihevc_trans_32[16][1]
1089*c83a76b0SSuyog Pawar * pi2_tmp[16
1090*c83a76b0SSuyog Pawar * trans_size];
1091*c83a76b0SSuyog Pawar
1092*c83a76b0SSuyog Pawar /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
1093*c83a76b0SSuyog Pawar eee[0] = eeee[0] + eeeo[0];
1094*c83a76b0SSuyog Pawar eee[3] = eeee[0] - eeeo[0];
1095*c83a76b0SSuyog Pawar eee[1] = eeee[1] + eeeo[1];
1096*c83a76b0SSuyog Pawar eee[2] = eeee[1] - eeeo[1];
1097*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
1098*c83a76b0SSuyog Pawar {
1099*c83a76b0SSuyog Pawar ee[k] = eee[k] + eeo[k];
1100*c83a76b0SSuyog Pawar ee[k + 4] = eee[3 - k] - eeo[3 - k];
1101*c83a76b0SSuyog Pawar }
1102*c83a76b0SSuyog Pawar for(k = 0; k < 8; k++)
1103*c83a76b0SSuyog Pawar {
1104*c83a76b0SSuyog Pawar e[k] = ee[k] + eo[k];
1105*c83a76b0SSuyog Pawar e[k + 8] = ee[7 - k] - eo[7 - k];
1106*c83a76b0SSuyog Pawar }
1107*c83a76b0SSuyog Pawar for(k = 0; k < 16; k++)
1108*c83a76b0SSuyog Pawar {
1109*c83a76b0SSuyog Pawar WORD32 itrans_out;
1110*c83a76b0SSuyog Pawar itrans_out =
1111*c83a76b0SSuyog Pawar CLIP_S16(((e[k] + o[k] + add) >> shift));
1112*c83a76b0SSuyog Pawar pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
1113*c83a76b0SSuyog Pawar itrans_out =
1114*c83a76b0SSuyog Pawar CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
1115*c83a76b0SSuyog Pawar pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
1116*c83a76b0SSuyog Pawar }
1117*c83a76b0SSuyog Pawar pi2_tmp++;
1118*c83a76b0SSuyog Pawar pu1_pred += pred_strd;
1119*c83a76b0SSuyog Pawar pu1_dst += dst_strd;
1120*c83a76b0SSuyog Pawar }
1121*c83a76b0SSuyog Pawar }
1122*c83a76b0SSuyog Pawar /************************************************************************************************/
1123*c83a76b0SSuyog Pawar /************************************END - IT_RECON_32x32****************************************/
1124*c83a76b0SSuyog Pawar /************************************************************************************************/
1125*c83a76b0SSuyog Pawar }
1126*c83a76b0SSuyog Pawar }
1127*c83a76b0SSuyog Pawar
1128