1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar *
3*c83a76b0SSuyog Pawar * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar *
5*c83a76b0SSuyog Pawar * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar *
9*c83a76b0SSuyog Pawar * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar *
11*c83a76b0SSuyog Pawar * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar * limitations under the License.
16*c83a76b0SSuyog Pawar *
17*c83a76b0SSuyog Pawar *****************************************************************************
18*c83a76b0SSuyog Pawar * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar /**
21*c83a76b0SSuyog Pawar *******************************************************************************
22*c83a76b0SSuyog Pawar * @file
23*c83a76b0SSuyog Pawar * ihevc_quant_iquant_ssd.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar * Contains function definitions for quantization, followed by Inverse
27*c83a76b0SSuyog Pawar * quantization to find transform domain SSD
28*c83a76b0SSuyog Pawar *
29*c83a76b0SSuyog Pawar * @author
30*c83a76b0SSuyog Pawar * 100453, 100578
31*c83a76b0SSuyog Pawar *
32*c83a76b0SSuyog Pawar * @par List of Functions:
33*c83a76b0SSuyog Pawar * - ihevc_quant_iquant_ssd()
34*c83a76b0SSuyog Pawar * - ihevc_quant_iquant_ssd_flat_scale_mat()
35*c83a76b0SSuyog Pawar *
36*c83a76b0SSuyog Pawar * @remarks
37*c83a76b0SSuyog Pawar * None
38*c83a76b0SSuyog Pawar *
39*c83a76b0SSuyog Pawar *******************************************************************************
40*c83a76b0SSuyog Pawar */
41*c83a76b0SSuyog Pawar
42*c83a76b0SSuyog Pawar #include <stdio.h>
43*c83a76b0SSuyog Pawar #include <string.h>
44*c83a76b0SSuyog Pawar #include <stdlib.h>
45*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
46*c83a76b0SSuyog Pawar #include "ihevc_macros.h"
47*c83a76b0SSuyog Pawar #include "ihevc_platform_macros.h"
48*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
49*c83a76b0SSuyog Pawar #include "ihevc_debug.h"
50*c83a76b0SSuyog Pawar #include "ihevc_trans_tables.h"
51*c83a76b0SSuyog Pawar #include "ihevc_quant_iquant_ssd.h"
52*c83a76b0SSuyog Pawar #include "ihevc_func_selector.h"
53*c83a76b0SSuyog Pawar #include "ihevc_trans_macros.h"
54*c83a76b0SSuyog Pawar #include <assert.h>
55*c83a76b0SSuyog Pawar
56*c83a76b0SSuyog Pawar /*****************************************************************************/
57*c83a76b0SSuyog Pawar /* Globals */
58*c83a76b0SSuyog Pawar /*****************************************************************************/
59*c83a76b0SSuyog Pawar
60*c83a76b0SSuyog Pawar
61*c83a76b0SSuyog Pawar /**
62*c83a76b0SSuyog Pawar *******************************************************************************
63*c83a76b0SSuyog Pawar *
64*c83a76b0SSuyog Pawar * @brief
65*c83a76b0SSuyog Pawar * This function performs quantization, followed by Inverse
66*c83a76b0SSuyog Pawar * quantization to find transform domain SSD
67*c83a76b0SSuyog Pawar *
68*c83a76b0SSuyog Pawar * @par Description:
69*c83a76b0SSuyog Pawar * Performs quantization on coeffs
70*c83a76b0SSuyog Pawar *
71*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
72*c83a76b0SSuyog Pawar * 4x4 Coeffs
73*c83a76b0SSuyog Pawar *
74*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
75*c83a76b0SSuyog Pawar * Scaling Matrix
76*c83a76b0SSuyog Pawar *
77*c83a76b0SSuyog Pawar * @param[out] pi2_dst
78*c83a76b0SSuyog Pawar * Output 4x4 coefficients
79*c83a76b0SSuyog Pawar *
80*c83a76b0SSuyog Pawar * @param[in] qp_div
81*c83a76b0SSuyog Pawar * Quantization parameter / 6
82*c83a76b0SSuyog Pawar *
83*c83a76b0SSuyog Pawar * @param[in] qp_rem
84*c83a76b0SSuyog Pawar * Quantization parameter % 6
85*c83a76b0SSuyog Pawar *
86*c83a76b0SSuyog Pawar * @param[in] src_strd
87*c83a76b0SSuyog Pawar * Input stride
88*c83a76b0SSuyog Pawar *
89*c83a76b0SSuyog Pawar * @param[in] dst_strd
90*c83a76b0SSuyog Pawar * Output Stride
91*c83a76b0SSuyog Pawar *
92*c83a76b0SSuyog Pawar * @param[out] csbf
93*c83a76b0SSuyog Pawar * coded sub block flag
94*c83a76b0SSuyog Pawar *
95*c83a76b0SSuyog Pawar * @param[in] csbf_strd
96*c83a76b0SSuyog Pawar * coded sub block flag
97*c83a76b0SSuyog Pawar *
98*c83a76b0SSuyog Pawar * @param[out] zero_col
99*c83a76b0SSuyog Pawar * zero column flag
100*c83a76b0SSuyog Pawar *
101*c83a76b0SSuyog Pawar * @param[out] zero_row
102*c83a76b0SSuyog Pawar * zero column flag
103*c83a76b0SSuyog Pawar *
104*c83a76b0SSuyog Pawar * @returns cbf
105*c83a76b0SSuyog Pawar * coded block flag
106*c83a76b0SSuyog Pawar *
107*c83a76b0SSuyog Pawar * @remarks
108*c83a76b0SSuyog Pawar * None
109*c83a76b0SSuyog Pawar *
110*c83a76b0SSuyog Pawar *******************************************************************************
111*c83a76b0SSuyog Pawar */
112*c83a76b0SSuyog Pawar
ihevc_quant_iquant_ssd(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)113*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_ssd
114*c83a76b0SSuyog Pawar (
115*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
116*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
117*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
118*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
119*c83a76b0SSuyog Pawar WORD32 trans_size,
120*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
121*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
122*c83a76b0SSuyog Pawar WORD32 q_add,
123*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
124*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
125*c83a76b0SSuyog Pawar WORD32 src_strd,
126*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
127*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
128*c83a76b0SSuyog Pawar UWORD8 *csbf,
129*c83a76b0SSuyog Pawar WORD32 csbf_strd,
130*c83a76b0SSuyog Pawar WORD32 *zero_col,
131*c83a76b0SSuyog Pawar WORD32 *zero_row,
132*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
133*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
134*c83a76b0SSuyog Pawar )
135*c83a76b0SSuyog Pawar {
136*c83a76b0SSuyog Pawar WORD32 i, j;
137*c83a76b0SSuyog Pawar WORD32 log2_size;
138*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
139*c83a76b0SSuyog Pawar WORD32 cbf = 0;
140*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
141*c83a76b0SSuyog Pawar WORD32 val;
142*c83a76b0SSuyog Pawar WORD16 i2_temp;
143*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
144*c83a76b0SSuyog Pawar
145*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
146*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
147*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
148*c83a76b0SSuyog Pawar
149*c83a76b0SSuyog Pawar /* Quant initialization */
150*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
151*c83a76b0SSuyog Pawar log2_size -= 1;
152*c83a76b0SSuyog Pawar
153*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
154*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
155*c83a76b0SSuyog Pawar
156*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
157*c83a76b0SSuyog Pawar {
158*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
159*c83a76b0SSuyog Pawar {
160*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
161*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
162*c83a76b0SSuyog Pawar
163*c83a76b0SSuyog Pawar /* Quantization */
164*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j], pi2_coeffs[j],
165*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
166*c83a76b0SSuyog Pawar log2_size, q_add);
167*c83a76b0SSuyog Pawar
168*c83a76b0SSuyog Pawar /* Inverse Quantization */
169*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
170*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
171*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
172*c83a76b0SSuyog Pawar /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
173*c83a76b0SSuyog Pawar shift_iq,
174*c83a76b0SSuyog Pawar qp_div);
175*c83a76b0SSuyog Pawar
176*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
177*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
178*c83a76b0SSuyog Pawar ssd_cost += val*val;
179*c83a76b0SSuyog Pawar
180*c83a76b0SSuyog Pawar }
181*c83a76b0SSuyog Pawar
182*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
183*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
184*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
185*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
186*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
187*c83a76b0SSuyog Pawar }
188*c83a76b0SSuyog Pawar
189*c83a76b0SSuyog Pawar /* Store the cost */
190*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
191*c83a76b0SSuyog Pawar
192*c83a76b0SSuyog Pawar /* CSBF update */
193*c83a76b0SSuyog Pawar {
194*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
195*c83a76b0SSuyog Pawar WORD32 row, col;
196*c83a76b0SSuyog Pawar WORD16 *pi2_block;
197*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
198*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
199*c83a76b0SSuyog Pawar
200*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
201*c83a76b0SSuyog Pawar
202*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
203*c83a76b0SSuyog Pawar {
204*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
205*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
206*c83a76b0SSuyog Pawar {
207*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
208*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
209*c83a76b0SSuyog Pawar
210*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
211*c83a76b0SSuyog Pawar {
212*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
213*c83a76b0SSuyog Pawar {
214*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
215*c83a76b0SSuyog Pawar {
216*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
217*c83a76b0SSuyog Pawar break;
218*c83a76b0SSuyog Pawar }
219*c83a76b0SSuyog Pawar }
220*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
221*c83a76b0SSuyog Pawar {
222*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
223*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
224*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
225*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
226*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
227*c83a76b0SSuyog Pawar
228*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
229*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
230*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
231*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
232*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
233*c83a76b0SSuyog Pawar
234*c83a76b0SSuyog Pawar break;
235*c83a76b0SSuyog Pawar }
236*c83a76b0SSuyog Pawar }
237*c83a76b0SSuyog Pawar
238*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
239*c83a76b0SSuyog Pawar }
240*c83a76b0SSuyog Pawar csbf += csbf_strd;
241*c83a76b0SSuyog Pawar }
242*c83a76b0SSuyog Pawar
243*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
244*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
245*c83a76b0SSuyog Pawar }
246*c83a76b0SSuyog Pawar
247*c83a76b0SSuyog Pawar return cbf;
248*c83a76b0SSuyog Pawar }
249*c83a76b0SSuyog Pawar
250*c83a76b0SSuyog Pawar /**
251*c83a76b0SSuyog Pawar *******************************************************************************
252*c83a76b0SSuyog Pawar *
253*c83a76b0SSuyog Pawar * @brief
254*c83a76b0SSuyog Pawar * This function performs quantization, followed by Inverse
255*c83a76b0SSuyog Pawar * quantization
256*c83a76b0SSuyog Pawar *
257*c83a76b0SSuyog Pawar * @par Description:
258*c83a76b0SSuyog Pawar * Performs quantization on coeffs
259*c83a76b0SSuyog Pawar *
260*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
261*c83a76b0SSuyog Pawar * 4x4 Coeffs
262*c83a76b0SSuyog Pawar *
263*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
264*c83a76b0SSuyog Pawar * Scaling Matrix
265*c83a76b0SSuyog Pawar *
266*c83a76b0SSuyog Pawar * @param[out] pi2_dst
267*c83a76b0SSuyog Pawar * Output 4x4 coefficients
268*c83a76b0SSuyog Pawar *
269*c83a76b0SSuyog Pawar * @param[in] qp_div
270*c83a76b0SSuyog Pawar * Quantization parameter / 6
271*c83a76b0SSuyog Pawar *
272*c83a76b0SSuyog Pawar * @param[in] qp_rem
273*c83a76b0SSuyog Pawar * Quantization parameter % 6
274*c83a76b0SSuyog Pawar *
275*c83a76b0SSuyog Pawar * @param[in] src_strd
276*c83a76b0SSuyog Pawar * Input stride
277*c83a76b0SSuyog Pawar *
278*c83a76b0SSuyog Pawar * @param[in] dst_strd
279*c83a76b0SSuyog Pawar * Output Stride
280*c83a76b0SSuyog Pawar *
281*c83a76b0SSuyog Pawar * @param[out] csbf
282*c83a76b0SSuyog Pawar * coded sub block flag
283*c83a76b0SSuyog Pawar *
284*c83a76b0SSuyog Pawar * @param[in] csbf_strd
285*c83a76b0SSuyog Pawar * coded sub block flag
286*c83a76b0SSuyog Pawar *
287*c83a76b0SSuyog Pawar * @param[out] zero_col
288*c83a76b0SSuyog Pawar * zero column flag
289*c83a76b0SSuyog Pawar *
290*c83a76b0SSuyog Pawar * @param[out] zero_row
291*c83a76b0SSuyog Pawar * zero column flag
292*c83a76b0SSuyog Pawar *
293*c83a76b0SSuyog Pawar * @returns cbf
294*c83a76b0SSuyog Pawar * coded block flag
295*c83a76b0SSuyog Pawar *
296*c83a76b0SSuyog Pawar * @remarks
297*c83a76b0SSuyog Pawar * None
298*c83a76b0SSuyog Pawar *
299*c83a76b0SSuyog Pawar *******************************************************************************
300*c83a76b0SSuyog Pawar */
301*c83a76b0SSuyog Pawar
ihevc_quant_iquant(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)302*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant
303*c83a76b0SSuyog Pawar (
304*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
305*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
306*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
307*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
308*c83a76b0SSuyog Pawar WORD32 trans_size,
309*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
310*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
311*c83a76b0SSuyog Pawar WORD32 q_add,
312*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
313*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
314*c83a76b0SSuyog Pawar WORD32 src_strd,
315*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
316*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
317*c83a76b0SSuyog Pawar UWORD8 *csbf,
318*c83a76b0SSuyog Pawar WORD32 csbf_strd,
319*c83a76b0SSuyog Pawar WORD32 *zero_col,
320*c83a76b0SSuyog Pawar WORD32 *zero_row,
321*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
322*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
323*c83a76b0SSuyog Pawar )
324*c83a76b0SSuyog Pawar {
325*c83a76b0SSuyog Pawar WORD32 i, j;
326*c83a76b0SSuyog Pawar WORD32 log2_size;
327*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
328*c83a76b0SSuyog Pawar WORD32 cbf = 0;
329*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
330*c83a76b0SSuyog Pawar WORD16 i2_temp;
331*c83a76b0SSuyog Pawar
332*c83a76b0SSuyog Pawar (void)pi8_cost;
333*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
334*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
335*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
336*c83a76b0SSuyog Pawar
337*c83a76b0SSuyog Pawar /* Quant initialization */
338*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
339*c83a76b0SSuyog Pawar log2_size -= 1;
340*c83a76b0SSuyog Pawar
341*c83a76b0SSuyog Pawar bit_depth = 8;
342*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
343*c83a76b0SSuyog Pawar
344*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
345*c83a76b0SSuyog Pawar {
346*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
347*c83a76b0SSuyog Pawar {
348*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
349*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
350*c83a76b0SSuyog Pawar
351*c83a76b0SSuyog Pawar /* Quantization */
352*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j], pi2_coeffs[j],
353*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
354*c83a76b0SSuyog Pawar log2_size, q_add);
355*c83a76b0SSuyog Pawar
356*c83a76b0SSuyog Pawar /* Inverse Quantization */
357*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
358*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
359*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
360*c83a76b0SSuyog Pawar shift_iq,
361*c83a76b0SSuyog Pawar qp_div);
362*c83a76b0SSuyog Pawar }
363*c83a76b0SSuyog Pawar
364*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
365*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
366*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
367*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
368*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
369*c83a76b0SSuyog Pawar }
370*c83a76b0SSuyog Pawar
371*c83a76b0SSuyog Pawar /* CSBF update */
372*c83a76b0SSuyog Pawar {
373*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
374*c83a76b0SSuyog Pawar WORD32 row, col;
375*c83a76b0SSuyog Pawar WORD16 *pi2_block;
376*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
377*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
378*c83a76b0SSuyog Pawar
379*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
380*c83a76b0SSuyog Pawar
381*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
382*c83a76b0SSuyog Pawar {
383*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
384*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
385*c83a76b0SSuyog Pawar {
386*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
387*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
388*c83a76b0SSuyog Pawar
389*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
390*c83a76b0SSuyog Pawar {
391*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
392*c83a76b0SSuyog Pawar {
393*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
394*c83a76b0SSuyog Pawar {
395*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
396*c83a76b0SSuyog Pawar break;
397*c83a76b0SSuyog Pawar }
398*c83a76b0SSuyog Pawar }
399*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
400*c83a76b0SSuyog Pawar {
401*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
402*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
403*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
404*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
405*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
406*c83a76b0SSuyog Pawar
407*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
408*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
409*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
410*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
411*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
412*c83a76b0SSuyog Pawar
413*c83a76b0SSuyog Pawar break;
414*c83a76b0SSuyog Pawar }
415*c83a76b0SSuyog Pawar }
416*c83a76b0SSuyog Pawar
417*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
418*c83a76b0SSuyog Pawar }
419*c83a76b0SSuyog Pawar
420*c83a76b0SSuyog Pawar csbf += csbf_strd;
421*c83a76b0SSuyog Pawar }
422*c83a76b0SSuyog Pawar
423*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
424*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
425*c83a76b0SSuyog Pawar }
426*c83a76b0SSuyog Pawar
427*c83a76b0SSuyog Pawar return cbf;
428*c83a76b0SSuyog Pawar }
429*c83a76b0SSuyog Pawar
430*c83a76b0SSuyog Pawar /**
431*c83a76b0SSuyog Pawar *******************************************************************************
432*c83a76b0SSuyog Pawar *
433*c83a76b0SSuyog Pawar * @brief
434*c83a76b0SSuyog Pawar * This function performs quantization, followed by Inverse
435*c83a76b0SSuyog Pawar * quantization to find transform domain SSD
436*c83a76b0SSuyog Pawar *
437*c83a76b0SSuyog Pawar * @par Description:
438*c83a76b0SSuyog Pawar * Performs quantization on coeffs
439*c83a76b0SSuyog Pawar *
440*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
441*c83a76b0SSuyog Pawar * 4x4 Coeffs
442*c83a76b0SSuyog Pawar *
443*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
444*c83a76b0SSuyog Pawar * Scaling Matrix
445*c83a76b0SSuyog Pawar *
446*c83a76b0SSuyog Pawar * @param[out] pi2_dst
447*c83a76b0SSuyog Pawar * Output 4x4 coefficients
448*c83a76b0SSuyog Pawar *
449*c83a76b0SSuyog Pawar * @param[in] qp_div
450*c83a76b0SSuyog Pawar * Quantization parameter / 6
451*c83a76b0SSuyog Pawar *
452*c83a76b0SSuyog Pawar * @param[in] qp_rem
453*c83a76b0SSuyog Pawar * Quantization parameter % 6
454*c83a76b0SSuyog Pawar *
455*c83a76b0SSuyog Pawar * @param[in] src_strd
456*c83a76b0SSuyog Pawar * Input stride
457*c83a76b0SSuyog Pawar *
458*c83a76b0SSuyog Pawar * @param[in] dst_strd
459*c83a76b0SSuyog Pawar * Output Stride
460*c83a76b0SSuyog Pawar *
461*c83a76b0SSuyog Pawar * @param[out] csbf
462*c83a76b0SSuyog Pawar * coded sub block flag
463*c83a76b0SSuyog Pawar *
464*c83a76b0SSuyog Pawar * @param[in] csbf_strd
465*c83a76b0SSuyog Pawar * coded sub block flag
466*c83a76b0SSuyog Pawar *
467*c83a76b0SSuyog Pawar * @param[out] zero_col
468*c83a76b0SSuyog Pawar * zero column flag
469*c83a76b0SSuyog Pawar *
470*c83a76b0SSuyog Pawar * @param[out] zero_row
471*c83a76b0SSuyog Pawar * zero column flag
472*c83a76b0SSuyog Pawar *
473*c83a76b0SSuyog Pawar * @returns cbf
474*c83a76b0SSuyog Pawar * coded block flag
475*c83a76b0SSuyog Pawar *
476*c83a76b0SSuyog Pawar * @remarks
477*c83a76b0SSuyog Pawar * None
478*c83a76b0SSuyog Pawar *
479*c83a76b0SSuyog Pawar *******************************************************************************
480*c83a76b0SSuyog Pawar */
481*c83a76b0SSuyog Pawar
ihevc_quant_iquant_ssd_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)482*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_ssd_rdoq
483*c83a76b0SSuyog Pawar (
484*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
485*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
486*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
487*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
488*c83a76b0SSuyog Pawar WORD32 trans_size,
489*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
490*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
491*c83a76b0SSuyog Pawar WORD32 q_add,
492*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
493*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
494*c83a76b0SSuyog Pawar WORD32 src_strd,
495*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
496*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
497*c83a76b0SSuyog Pawar UWORD8 *csbf,
498*c83a76b0SSuyog Pawar WORD32 csbf_strd,
499*c83a76b0SSuyog Pawar WORD32 *zero_col,
500*c83a76b0SSuyog Pawar WORD32 *zero_row,
501*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
502*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
503*c83a76b0SSuyog Pawar )
504*c83a76b0SSuyog Pawar {
505*c83a76b0SSuyog Pawar WORD32 i, j;
506*c83a76b0SSuyog Pawar WORD32 log2_size;
507*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
508*c83a76b0SSuyog Pawar WORD32 cbf = 0;
509*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
510*c83a76b0SSuyog Pawar WORD32 val;
511*c83a76b0SSuyog Pawar WORD16 i2_temp;
512*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
513*c83a76b0SSuyog Pawar
514*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
515*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
516*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
517*c83a76b0SSuyog Pawar
518*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
519*c83a76b0SSuyog Pawar log2_size -= 1;
520*c83a76b0SSuyog Pawar
521*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
522*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
523*c83a76b0SSuyog Pawar
524*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
525*c83a76b0SSuyog Pawar {
526*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
527*c83a76b0SSuyog Pawar {
528*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
529*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
530*c83a76b0SSuyog Pawar
531*c83a76b0SSuyog Pawar /* Quantization */
532*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j], pi2_coeffs[j],
533*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
534*c83a76b0SSuyog Pawar log2_size, q_add);
535*c83a76b0SSuyog Pawar
536*c83a76b0SSuyog Pawar
537*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) > 1)
538*c83a76b0SSuyog Pawar {
539*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
540*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
541*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
542*c83a76b0SSuyog Pawar
543*c83a76b0SSuyog Pawar }
544*c83a76b0SSuyog Pawar
545*c83a76b0SSuyog Pawar
546*c83a76b0SSuyog Pawar /* Inverse Quantization */
547*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
548*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
549*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
550*c83a76b0SSuyog Pawar /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
551*c83a76b0SSuyog Pawar shift_iq,
552*c83a76b0SSuyog Pawar qp_div);
553*c83a76b0SSuyog Pawar
554*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
555*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
556*c83a76b0SSuyog Pawar ssd_cost += val*val;
557*c83a76b0SSuyog Pawar
558*c83a76b0SSuyog Pawar }
559*c83a76b0SSuyog Pawar
560*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
561*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
562*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
563*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
564*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
565*c83a76b0SSuyog Pawar }
566*c83a76b0SSuyog Pawar /* Store the cost */
567*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
568*c83a76b0SSuyog Pawar
569*c83a76b0SSuyog Pawar /* CSBF update */
570*c83a76b0SSuyog Pawar {
571*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
572*c83a76b0SSuyog Pawar WORD32 row, col;
573*c83a76b0SSuyog Pawar WORD16 *pi2_block;
574*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
575*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
576*c83a76b0SSuyog Pawar
577*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
578*c83a76b0SSuyog Pawar
579*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
580*c83a76b0SSuyog Pawar {
581*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
582*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
583*c83a76b0SSuyog Pawar {
584*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
585*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
586*c83a76b0SSuyog Pawar
587*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
588*c83a76b0SSuyog Pawar {
589*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
590*c83a76b0SSuyog Pawar {
591*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
592*c83a76b0SSuyog Pawar {
593*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
594*c83a76b0SSuyog Pawar break;
595*c83a76b0SSuyog Pawar }
596*c83a76b0SSuyog Pawar }
597*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
598*c83a76b0SSuyog Pawar {
599*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
600*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
601*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
602*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
603*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
604*c83a76b0SSuyog Pawar
605*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
606*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
607*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
608*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
609*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
610*c83a76b0SSuyog Pawar
611*c83a76b0SSuyog Pawar break;
612*c83a76b0SSuyog Pawar }
613*c83a76b0SSuyog Pawar }
614*c83a76b0SSuyog Pawar
615*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
616*c83a76b0SSuyog Pawar }
617*c83a76b0SSuyog Pawar csbf += csbf_strd;
618*c83a76b0SSuyog Pawar }
619*c83a76b0SSuyog Pawar
620*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
621*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
622*c83a76b0SSuyog Pawar }
623*c83a76b0SSuyog Pawar
624*c83a76b0SSuyog Pawar return cbf;
625*c83a76b0SSuyog Pawar }
626*c83a76b0SSuyog Pawar
/**
 * Performs quantization (weighted scale matrix) followed by inverse
 * quantization, with an RDOQ-style second pass: any coefficient whose
 * first-pass quantized magnitude exceeds 1 is re-quantized from the
 * backed-up input using a fixed half-rounding offset. Also derives the
 * coded-sub-block flags (csbf), the inverted zero row/column occupancy
 * masks and the overall coded-block flag (cbf, the return value).
 * No SSD/cost is computed here; pi8_cost is unused.
 */
WORD32 ihevc_quant_iquant_rdoq
(
    WORD16 *pi2_coeffs,                 /* input transform coefficients        */
    WORD16 *pi2_quant_coeff,            /* quantization scaling matrix         */
    WORD16 *pi2_q_dst,                  /* [out] quantized coefficients        */
    WORD16 *pi2_iq_dst,                 /* [out] inverse-quantized coeffs      */
    WORD32 trans_size,                  /* transform size (4/8/16/32)          */
    WORD32 qp_div,/* qpscaled / 6 */
    WORD32 qp_rem,/* qpscaled % 6 */
    WORD32 q_add,                       /* rounding offset for first pass      */
    WORD32 *pi4_quant_round_factor_0_1, /* unused in this variant              */
    WORD32 *pi4_quant_round_factor_1_2, /* unused in this variant              */
    WORD32 src_strd,                    /* stride of pi2_coeffs                */
    WORD32 dst_q_strd,                  /* stride of pi2_q_dst                 */
    WORD32 dst_iq_strd,                 /* stride of pi2_iq_dst                */
    UWORD8 *csbf,                       /* [out] coded sub-block flags         */
    WORD32 csbf_strd,                   /* stride of csbf                      */
    WORD32 *zero_col,                   /* [out] ~(nonzero column mask)        */
    WORD32 *zero_row,                   /* [out] ~(nonzero row mask)           */
    WORD16 *pi2_dequant_coeff,          /* dequantization scaling matrix       */
    LWORD64 *pi8_cost                   /* unused in this variant              */
)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD16 *pi2_q_dst_orig;
    WORD32 cbf = 0;
    WORD32 bit_depth,shift_iq;
    WORD16 i2_temp;

    (void)pi8_cost;
    (void)pi4_quant_round_factor_0_1;
    (void)pi4_quant_round_factor_1_2;
    /* Remember the start of the quantized output for the CSBF rescan below */
    pi2_q_dst_orig = pi2_q_dst;

    /* GETRANGE yields position of MSB; minus 1 gives log2(trans_size) */
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    bit_depth = 8 + 0;
    /* Inverse-quant shift per HEVC scaling process: bd + log2(size) - 5 */
    shift_iq = bit_depth + log2_size - 5;

    for(i = 0; i < trans_size; i++)
    {
        for(j = 0; j < trans_size; j++)
        {
            /* Back up the coefficients before Quantization */
            i2_temp = pi2_coeffs[j];

            /* First-pass quantization with the caller-supplied rounding */
            QUANT(pi2_q_dst[j], pi2_coeffs[j],
                  pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
                  log2_size, q_add);

            /* RDOQ second pass: levels above 1 are re-quantized from the
               backup using half rounding instead of q_add */
            if (abs(pi2_q_dst[j]) > 1)
            {
                QUANT(pi2_q_dst[j],i2_temp,
                      pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
                      log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
            }

            /* Inverse Quantization of the final level */
            IQUANT(pi2_iq_dst[j],
                   pi2_q_dst[j], /*pi2_src[index*src_strd]*/
                   pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
                   shift_iq,
                   qp_div);
        }

        /* Advance all row pointers; scale matrices are trans_size-strided */
        pi2_q_dst += dst_q_strd;
        pi2_iq_dst += dst_iq_strd;
        pi2_quant_coeff += trans_size;
        pi2_coeffs += src_strd;
        pi2_dequant_coeff += trans_size;
    }

    /* CSBF update: rescan quantized output in 4x4 sub-blocks */
    {
        WORD32 block_row, block_col;
        WORD32 row, col;
        WORD16 *pi2_block;
        UWORD32 temp_zero_col = 0;
        UWORD32 temp_zero_row = 0;

        pi2_q_dst = pi2_q_dst_orig;

        for(block_row = 0; block_row < trans_size; block_row += 4)
        {
            //block_col is incrementing by 1 for easy update of csbf pointer
            for(block_col = 0; block_col < trans_size / 4; block_col++)
            {
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
                *(csbf + block_col) = 0;

                for(row = 0; row < 4; row++)
                {
                    for(col = 0; col < 4; col++)
                    {
                        if(pi2_block[row * dst_q_strd + col] != 0)
                        {
                            *(csbf + block_col) = 1;
                            break;
                        }
                    }
                    if(*(csbf + block_col) == 1)
                    {
                        /* zero_col update *//* temp_zero_col = ~zero_col */
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
                        // zero col can be optimized further. Now clearing the
                        // entire 4 bits corresponding to 4 colums of 4x4 block
                        // even if any 4x4 csbf is set

                        /* zero row update */ /* temp_zero_row = ~zero_row */
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
                        // zero row can be optimized further. Now clearing the
                        // entire 4 bits corresponding to 4 rows of 4x4 block
                        // even if any 4x4 csbf is set

                        break;
                    }
                }

                cbf = cbf || (*(csbf + block_col)); // cbf update
            }
            csbf += csbf_strd;
        }

        *zero_col = ~temp_zero_col; //final zero_col storing
        *zero_row = ~temp_zero_row; //final zero_row storing
    }

    return cbf;
}
759*c83a76b0SSuyog Pawar
760*c83a76b0SSuyog Pawar /**
761*c83a76b0SSuyog Pawar *******************************************************************************
762*c83a76b0SSuyog Pawar *
763*c83a76b0SSuyog Pawar * @brief
764*c83a76b0SSuyog Pawar * This function performs quantization(using flat scale matrix), followed by
765*c83a76b0SSuyog Pawar * inverse quantization to find transform domain SSD
766*c83a76b0SSuyog Pawar *
767*c83a76b0SSuyog Pawar * @par Description:
768*c83a76b0SSuyog Pawar * Performs quantization on coeffs
769*c83a76b0SSuyog Pawar *
770*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
771*c83a76b0SSuyog Pawar * 4x4 Coeffs
772*c83a76b0SSuyog Pawar *
773*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
774*c83a76b0SSuyog Pawar * Scaling Matrix
775*c83a76b0SSuyog Pawar *
776*c83a76b0SSuyog Pawar * @param[out] pi2_dst
777*c83a76b0SSuyog Pawar * Output 4x4 coefficients
778*c83a76b0SSuyog Pawar *
779*c83a76b0SSuyog Pawar * @param[in] qp_div
780*c83a76b0SSuyog Pawar * Quantization parameter / 6
781*c83a76b0SSuyog Pawar *
782*c83a76b0SSuyog Pawar * @param[in] qp_rem
783*c83a76b0SSuyog Pawar * Quantization parameter % 6
784*c83a76b0SSuyog Pawar *
785*c83a76b0SSuyog Pawar * @param[in] src_strd
786*c83a76b0SSuyog Pawar * Input stride
787*c83a76b0SSuyog Pawar *
788*c83a76b0SSuyog Pawar * @param[in] dst_strd
789*c83a76b0SSuyog Pawar * Output Stride
790*c83a76b0SSuyog Pawar *
791*c83a76b0SSuyog Pawar * @param[out] csbf
792*c83a76b0SSuyog Pawar * coded sub block flag
793*c83a76b0SSuyog Pawar *
794*c83a76b0SSuyog Pawar * @param[in] csbf_strd
795*c83a76b0SSuyog Pawar * coded sub block flag
796*c83a76b0SSuyog Pawar *
797*c83a76b0SSuyog Pawar * @param[out] zero_col
798*c83a76b0SSuyog Pawar * zero column flag
799*c83a76b0SSuyog Pawar *
800*c83a76b0SSuyog Pawar * @param[out] zero_row
 * zero row flag
802*c83a76b0SSuyog Pawar *
803*c83a76b0SSuyog Pawar * @returns cbf
804*c83a76b0SSuyog Pawar * coded block flag
805*c83a76b0SSuyog Pawar *
806*c83a76b0SSuyog Pawar * @remarks
807*c83a76b0SSuyog Pawar * None
808*c83a76b0SSuyog Pawar *
809*c83a76b0SSuyog Pawar *******************************************************************************
810*c83a76b0SSuyog Pawar */
811*c83a76b0SSuyog Pawar
ihevc_quant_iquant_ssd_flat_scale_mat(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)812*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_ssd_flat_scale_mat
813*c83a76b0SSuyog Pawar (
814*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
815*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
816*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
817*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
818*c83a76b0SSuyog Pawar WORD32 trans_size,
819*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
820*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
821*c83a76b0SSuyog Pawar WORD32 q_add,
822*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
823*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
824*c83a76b0SSuyog Pawar WORD32 src_strd,
825*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
826*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
827*c83a76b0SSuyog Pawar UWORD8 *csbf,
828*c83a76b0SSuyog Pawar WORD32 csbf_strd,
829*c83a76b0SSuyog Pawar WORD32 *zero_col,
830*c83a76b0SSuyog Pawar WORD32 *zero_row,
831*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
832*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
833*c83a76b0SSuyog Pawar )
834*c83a76b0SSuyog Pawar {
835*c83a76b0SSuyog Pawar WORD32 i, j;
836*c83a76b0SSuyog Pawar WORD32 log2_size;
837*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
838*c83a76b0SSuyog Pawar WORD32 cbf = 0;
839*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
840*c83a76b0SSuyog Pawar WORD32 val;
841*c83a76b0SSuyog Pawar WORD16 i2_temp;
842*c83a76b0SSuyog Pawar /* Initialize cost to zero */
843*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
844*c83a76b0SSuyog Pawar
845*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
846*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
847*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
848*c83a76b0SSuyog Pawar
849*c83a76b0SSuyog Pawar /* Quant initialization */
850*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
851*c83a76b0SSuyog Pawar log2_size -= 1;
852*c83a76b0SSuyog Pawar
853*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
854*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
855*c83a76b0SSuyog Pawar
856*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
857*c83a76b0SSuyog Pawar {
858*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
859*c83a76b0SSuyog Pawar {
860*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
861*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
862*c83a76b0SSuyog Pawar
863*c83a76b0SSuyog Pawar /*QUANT(pi2_dst[j], pi2_coeffs[j],
864*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
865*c83a76b0SSuyog Pawar log2_size, q_add);*/
866*c83a76b0SSuyog Pawar
867*c83a76b0SSuyog Pawar /* modified by 1028 */
868*c83a76b0SSuyog Pawar /* Quantization */
869*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
870*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
871*c83a76b0SSuyog Pawar log2_size, q_add);
872*c83a76b0SSuyog Pawar
873*c83a76b0SSuyog Pawar if(pi2_q_dst[j] == 0)
874*c83a76b0SSuyog Pawar {
875*c83a76b0SSuyog Pawar pi2_iq_dst[j] = 0;
876*c83a76b0SSuyog Pawar }
877*c83a76b0SSuyog Pawar else
878*c83a76b0SSuyog Pawar {
879*c83a76b0SSuyog Pawar /* Inverse Quantization */
880*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
881*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
882*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
883*c83a76b0SSuyog Pawar shift_iq,
884*c83a76b0SSuyog Pawar qp_div);
885*c83a76b0SSuyog Pawar }
886*c83a76b0SSuyog Pawar
887*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
888*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
889*c83a76b0SSuyog Pawar ssd_cost += val*val;
890*c83a76b0SSuyog Pawar
891*c83a76b0SSuyog Pawar }
892*c83a76b0SSuyog Pawar
893*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
894*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
895*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
896*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
897*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
898*c83a76b0SSuyog Pawar }
899*c83a76b0SSuyog Pawar /* Store the cost */
900*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
901*c83a76b0SSuyog Pawar
902*c83a76b0SSuyog Pawar /* CSBF update */
903*c83a76b0SSuyog Pawar {
904*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
905*c83a76b0SSuyog Pawar WORD32 row, col;
906*c83a76b0SSuyog Pawar WORD16 *pi2_block;
907*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
908*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
909*c83a76b0SSuyog Pawar
910*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
911*c83a76b0SSuyog Pawar
912*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
913*c83a76b0SSuyog Pawar {
914*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
915*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
916*c83a76b0SSuyog Pawar {
917*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
918*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
919*c83a76b0SSuyog Pawar
920*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
921*c83a76b0SSuyog Pawar {
922*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
923*c83a76b0SSuyog Pawar {
924*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
925*c83a76b0SSuyog Pawar {
926*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
927*c83a76b0SSuyog Pawar break;
928*c83a76b0SSuyog Pawar }
929*c83a76b0SSuyog Pawar }
930*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
931*c83a76b0SSuyog Pawar {
932*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
933*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
934*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
935*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
936*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
937*c83a76b0SSuyog Pawar
938*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
939*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
940*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
941*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
942*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
943*c83a76b0SSuyog Pawar
944*c83a76b0SSuyog Pawar break;
945*c83a76b0SSuyog Pawar }
946*c83a76b0SSuyog Pawar }
947*c83a76b0SSuyog Pawar
948*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
949*c83a76b0SSuyog Pawar }
950*c83a76b0SSuyog Pawar csbf += csbf_strd;
951*c83a76b0SSuyog Pawar }
952*c83a76b0SSuyog Pawar
953*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
954*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
955*c83a76b0SSuyog Pawar }
956*c83a76b0SSuyog Pawar
957*c83a76b0SSuyog Pawar return cbf;
958*c83a76b0SSuyog Pawar }
959*c83a76b0SSuyog Pawar
ihevc_quant_iquant_flat_scale_mat(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)960*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_flat_scale_mat
961*c83a76b0SSuyog Pawar (
962*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
963*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
964*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
965*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
966*c83a76b0SSuyog Pawar WORD32 trans_size,
967*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
968*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
969*c83a76b0SSuyog Pawar WORD32 q_add,
970*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
971*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
972*c83a76b0SSuyog Pawar WORD32 src_strd,
973*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
974*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
975*c83a76b0SSuyog Pawar UWORD8 *csbf,
976*c83a76b0SSuyog Pawar WORD32 csbf_strd,
977*c83a76b0SSuyog Pawar WORD32 *zero_col,
978*c83a76b0SSuyog Pawar WORD32 *zero_row,
979*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
980*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
981*c83a76b0SSuyog Pawar )
982*c83a76b0SSuyog Pawar {
983*c83a76b0SSuyog Pawar WORD32 i, j;
984*c83a76b0SSuyog Pawar WORD32 log2_size;
985*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
986*c83a76b0SSuyog Pawar WORD32 cbf = 0;
987*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
988*c83a76b0SSuyog Pawar WORD16 i2_temp;
989*c83a76b0SSuyog Pawar
990*c83a76b0SSuyog Pawar (void)pi8_cost;
991*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
992*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
993*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
994*c83a76b0SSuyog Pawar
995*c83a76b0SSuyog Pawar /* Quant initialization */
996*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
997*c83a76b0SSuyog Pawar log2_size -= 1;
998*c83a76b0SSuyog Pawar
999*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1000*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1001*c83a76b0SSuyog Pawar
1002*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1003*c83a76b0SSuyog Pawar {
1004*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1005*c83a76b0SSuyog Pawar {
1006*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
1007*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1008*c83a76b0SSuyog Pawar
1009*c83a76b0SSuyog Pawar /* Quantization */
1010*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1011*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1012*c83a76b0SSuyog Pawar log2_size, q_add);
1013*c83a76b0SSuyog Pawar
1014*c83a76b0SSuyog Pawar if(pi2_q_dst[j] == 0)
1015*c83a76b0SSuyog Pawar {
1016*c83a76b0SSuyog Pawar pi2_iq_dst[j] = 0;
1017*c83a76b0SSuyog Pawar }
1018*c83a76b0SSuyog Pawar else
1019*c83a76b0SSuyog Pawar {
1020*c83a76b0SSuyog Pawar /* Inverse Quantization */
1021*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1022*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1023*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1024*c83a76b0SSuyog Pawar shift_iq,
1025*c83a76b0SSuyog Pawar qp_div);
1026*c83a76b0SSuyog Pawar }
1027*c83a76b0SSuyog Pawar }
1028*c83a76b0SSuyog Pawar
1029*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
1030*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
1031*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
1032*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
1033*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
1034*c83a76b0SSuyog Pawar }
1035*c83a76b0SSuyog Pawar
1036*c83a76b0SSuyog Pawar /* CSBF update */
1037*c83a76b0SSuyog Pawar {
1038*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
1039*c83a76b0SSuyog Pawar WORD32 row, col;
1040*c83a76b0SSuyog Pawar WORD16 *pi2_block;
1041*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
1042*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
1043*c83a76b0SSuyog Pawar
1044*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
1045*c83a76b0SSuyog Pawar
1046*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
1047*c83a76b0SSuyog Pawar {
1048*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
1049*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
1050*c83a76b0SSuyog Pawar {
1051*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1052*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
1053*c83a76b0SSuyog Pawar
1054*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
1055*c83a76b0SSuyog Pawar {
1056*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
1057*c83a76b0SSuyog Pawar {
1058*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
1059*c83a76b0SSuyog Pawar {
1060*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
1061*c83a76b0SSuyog Pawar break;
1062*c83a76b0SSuyog Pawar }
1063*c83a76b0SSuyog Pawar }
1064*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
1065*c83a76b0SSuyog Pawar {
1066*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
1067*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1068*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
1069*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
1070*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1071*c83a76b0SSuyog Pawar
1072*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
1073*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1074*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
1075*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
1076*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1077*c83a76b0SSuyog Pawar
1078*c83a76b0SSuyog Pawar break;
1079*c83a76b0SSuyog Pawar }
1080*c83a76b0SSuyog Pawar }
1081*c83a76b0SSuyog Pawar
1082*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
1083*c83a76b0SSuyog Pawar }
1084*c83a76b0SSuyog Pawar csbf += csbf_strd;
1085*c83a76b0SSuyog Pawar }
1086*c83a76b0SSuyog Pawar
1087*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
1088*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
1089*c83a76b0SSuyog Pawar }
1090*c83a76b0SSuyog Pawar
1091*c83a76b0SSuyog Pawar return cbf;
1092*c83a76b0SSuyog Pawar }
1093*c83a76b0SSuyog Pawar
1094*c83a76b0SSuyog Pawar /**
1095*c83a76b0SSuyog Pawar *******************************************************************************
1096*c83a76b0SSuyog Pawar *
1097*c83a76b0SSuyog Pawar * @brief
1098*c83a76b0SSuyog Pawar * This function performs quantization(using flat scale matrix), followed by
1099*c83a76b0SSuyog Pawar * inverse quantization to find transform domain SSD; when we perform RDOQ.
 * In case the quantized value turns out to be greater than 1, we then
 * requantize using half rounding.
1102*c83a76b0SSuyog Pawar *
1103*c83a76b0SSuyog Pawar * @par Description:
1104*c83a76b0SSuyog Pawar * Performs quantization on coeffs
1105*c83a76b0SSuyog Pawar *
1106*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
1107*c83a76b0SSuyog Pawar * 4x4 Coeffs
1108*c83a76b0SSuyog Pawar *
1109*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
1110*c83a76b0SSuyog Pawar * Scaling Matrix
1111*c83a76b0SSuyog Pawar *
1112*c83a76b0SSuyog Pawar * @param[out] pi2_dst
1113*c83a76b0SSuyog Pawar * Output 4x4 coefficients
1114*c83a76b0SSuyog Pawar *
1115*c83a76b0SSuyog Pawar * @param[in] qp_div
1116*c83a76b0SSuyog Pawar * Quantization parameter / 6
1117*c83a76b0SSuyog Pawar *
1118*c83a76b0SSuyog Pawar * @param[in] qp_rem
1119*c83a76b0SSuyog Pawar * Quantization parameter % 6
1120*c83a76b0SSuyog Pawar *
1121*c83a76b0SSuyog Pawar * @param[in] src_strd
1122*c83a76b0SSuyog Pawar * Input stride
1123*c83a76b0SSuyog Pawar *
1124*c83a76b0SSuyog Pawar * @param[in] dst_strd
1125*c83a76b0SSuyog Pawar * Output Stride
1126*c83a76b0SSuyog Pawar *
1127*c83a76b0SSuyog Pawar * @param[out] csbf
1128*c83a76b0SSuyog Pawar * coded sub block flag
1129*c83a76b0SSuyog Pawar *
1130*c83a76b0SSuyog Pawar * @param[in] csbf_strd
1131*c83a76b0SSuyog Pawar * coded sub block flag
1132*c83a76b0SSuyog Pawar *
1133*c83a76b0SSuyog Pawar * @param[out] zero_col
1134*c83a76b0SSuyog Pawar * zero column flag
1135*c83a76b0SSuyog Pawar *
1136*c83a76b0SSuyog Pawar * @param[out] zero_row
 * zero row flag
1138*c83a76b0SSuyog Pawar *
1139*c83a76b0SSuyog Pawar * @returns cbf
1140*c83a76b0SSuyog Pawar * coded block flag
1141*c83a76b0SSuyog Pawar *
1142*c83a76b0SSuyog Pawar * @remarks
1143*c83a76b0SSuyog Pawar * None
1144*c83a76b0SSuyog Pawar *
1145*c83a76b0SSuyog Pawar *******************************************************************************
1146*c83a76b0SSuyog Pawar */
1147*c83a76b0SSuyog Pawar
ihevc_quant_iquant_ssd_flat_scale_mat_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1148*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_rdoq
1149*c83a76b0SSuyog Pawar (
1150*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
1151*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
1152*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
1153*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
1154*c83a76b0SSuyog Pawar WORD32 trans_size,
1155*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
1156*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
1157*c83a76b0SSuyog Pawar WORD32 q_add,
1158*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
1159*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
1160*c83a76b0SSuyog Pawar WORD32 src_strd,
1161*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
1162*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
1163*c83a76b0SSuyog Pawar UWORD8 *csbf,
1164*c83a76b0SSuyog Pawar WORD32 csbf_strd,
1165*c83a76b0SSuyog Pawar WORD32 *zero_col,
1166*c83a76b0SSuyog Pawar WORD32 *zero_row,
1167*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
1168*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
1169*c83a76b0SSuyog Pawar )
1170*c83a76b0SSuyog Pawar {
1171*c83a76b0SSuyog Pawar WORD32 i, j;
1172*c83a76b0SSuyog Pawar WORD32 log2_size;
1173*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
1174*c83a76b0SSuyog Pawar WORD32 cbf = 0;
1175*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
1176*c83a76b0SSuyog Pawar WORD32 val;
1177*c83a76b0SSuyog Pawar WORD16 i2_temp;
1178*c83a76b0SSuyog Pawar /* Initialize cost to zero */
1179*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
1180*c83a76b0SSuyog Pawar
1181*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
1182*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
1183*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
1184*c83a76b0SSuyog Pawar
1185*c83a76b0SSuyog Pawar /* Quant initialization */
1186*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
1187*c83a76b0SSuyog Pawar log2_size -= 1;
1188*c83a76b0SSuyog Pawar
1189*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1190*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1191*c83a76b0SSuyog Pawar
1192*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1193*c83a76b0SSuyog Pawar {
1194*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1195*c83a76b0SSuyog Pawar {
1196*c83a76b0SSuyog Pawar WORD16 i2_temp1;
1197*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
1198*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1199*c83a76b0SSuyog Pawar
1200*c83a76b0SSuyog Pawar /*QUANT(pi2_dst[j], pi2_coeffs[j],
1201*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1202*c83a76b0SSuyog Pawar log2_size, q_add);*/
1203*c83a76b0SSuyog Pawar
1204*c83a76b0SSuyog Pawar /* modified by 1028 */
1205*c83a76b0SSuyog Pawar /* Quantization */
1206*c83a76b0SSuyog Pawar
1207*c83a76b0SSuyog Pawar if (1)
1208*c83a76b0SSuyog Pawar {
1209*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1210*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1211*c83a76b0SSuyog Pawar log2_size, q_add);
1212*c83a76b0SSuyog Pawar }
1213*c83a76b0SSuyog Pawar else
1214*c83a76b0SSuyog Pawar { \
1215*c83a76b0SSuyog Pawar WORD16 inp = pi2_coeffs[j],out = pi2_q_dst[j];
1216*c83a76b0SSuyog Pawar WORD32 quant_coeff = g_ihevc_quant_scales[qp_rem];
1217*c83a76b0SSuyog Pawar WORD32 log2_trans_size = log2_size;
1218*c83a76b0SSuyog Pawar WORD32 tmp; \
1219*c83a76b0SSuyog Pawar WORD32 sign; \
1220*c83a76b0SSuyog Pawar WORD32 bit_depth,transform_shift; \
1221*c83a76b0SSuyog Pawar WORD32 q_bits, quant_multiplier; \
1222*c83a76b0SSuyog Pawar \
1223*c83a76b0SSuyog Pawar /* q_bits and q_add calculation*/ \
1224*c83a76b0SSuyog Pawar /* To be moved outside in neon. To be computer once per transform call */ \
1225*c83a76b0SSuyog Pawar bit_depth = 8; \
1226*c83a76b0SSuyog Pawar transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size; \
1227*c83a76b0SSuyog Pawar quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */ \
1228*c83a76b0SSuyog Pawar q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */; \
1229*c83a76b0SSuyog Pawar \
1230*c83a76b0SSuyog Pawar sign = (inp)<0 ? -1:1; \
1231*c83a76b0SSuyog Pawar \
1232*c83a76b0SSuyog Pawar tmp = (WORD32)(abs(inp)); \
1233*c83a76b0SSuyog Pawar tmp = tmp * (quant_coeff); \
1234*c83a76b0SSuyog Pawar tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)); \
1235*c83a76b0SSuyog Pawar tmp = tmp >> q_bits; \
1236*c83a76b0SSuyog Pawar \
1237*c83a76b0SSuyog Pawar tmp = tmp * sign; \
1238*c83a76b0SSuyog Pawar out = (WORD16) CLIP_S16(tmp); \
1239*c83a76b0SSuyog Pawar }
1240*c83a76b0SSuyog Pawar i2_temp1 = pi2_q_dst[j];
1241*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) > 1)
1242*c83a76b0SSuyog Pawar {
1243*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1244*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1245*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1246*c83a76b0SSuyog Pawar }
1247*c83a76b0SSuyog Pawar
1248*c83a76b0SSuyog Pawar
1249*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1250*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1251*c83a76b0SSuyog Pawar
1252*c83a76b0SSuyog Pawar
1253*c83a76b0SSuyog Pawar /* Inverse Quantization */
1254*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1255*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1256*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1257*c83a76b0SSuyog Pawar shift_iq,
1258*c83a76b0SSuyog Pawar qp_div);
1259*c83a76b0SSuyog Pawar
1260*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
1261*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
1262*c83a76b0SSuyog Pawar ssd_cost += val*val;
1263*c83a76b0SSuyog Pawar
1264*c83a76b0SSuyog Pawar }
1265*c83a76b0SSuyog Pawar
1266*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
1267*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
1268*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
1269*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
1270*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
1271*c83a76b0SSuyog Pawar
1272*c83a76b0SSuyog Pawar }
1273*c83a76b0SSuyog Pawar /* Store the cost */
1274*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
1275*c83a76b0SSuyog Pawar
1276*c83a76b0SSuyog Pawar /* CSBF update */
1277*c83a76b0SSuyog Pawar {
1278*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
1279*c83a76b0SSuyog Pawar WORD32 row, col;
1280*c83a76b0SSuyog Pawar WORD16 *pi2_block;
1281*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
1282*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
1283*c83a76b0SSuyog Pawar
1284*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
1285*c83a76b0SSuyog Pawar
1286*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
1287*c83a76b0SSuyog Pawar {
1288*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
1289*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
1290*c83a76b0SSuyog Pawar {
1291*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1292*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
1293*c83a76b0SSuyog Pawar
1294*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
1295*c83a76b0SSuyog Pawar {
1296*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
1297*c83a76b0SSuyog Pawar {
1298*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
1299*c83a76b0SSuyog Pawar {
1300*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
1301*c83a76b0SSuyog Pawar break;
1302*c83a76b0SSuyog Pawar }
1303*c83a76b0SSuyog Pawar }
1304*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
1305*c83a76b0SSuyog Pawar {
1306*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
1307*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1308*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
1309*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
1310*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1311*c83a76b0SSuyog Pawar
1312*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
1313*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1314*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
1315*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
1316*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1317*c83a76b0SSuyog Pawar
1318*c83a76b0SSuyog Pawar break;
1319*c83a76b0SSuyog Pawar }
1320*c83a76b0SSuyog Pawar }
1321*c83a76b0SSuyog Pawar
1322*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
1323*c83a76b0SSuyog Pawar }
1324*c83a76b0SSuyog Pawar csbf += csbf_strd;
1325*c83a76b0SSuyog Pawar }
1326*c83a76b0SSuyog Pawar
1327*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
1328*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
1329*c83a76b0SSuyog Pawar }
1330*c83a76b0SSuyog Pawar return cbf;
1331*c83a76b0SSuyog Pawar }
1332*c83a76b0SSuyog Pawar
ihevc_quant_iquant_flat_scale_mat_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1333*c83a76b0SSuyog Pawar WORD32 ihevc_quant_iquant_flat_scale_mat_rdoq
1334*c83a76b0SSuyog Pawar (
1335*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
1336*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
1337*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
1338*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
1339*c83a76b0SSuyog Pawar WORD32 trans_size,
1340*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
1341*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
1342*c83a76b0SSuyog Pawar WORD32 q_add,
1343*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
1344*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
1345*c83a76b0SSuyog Pawar WORD32 src_strd,
1346*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
1347*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
1348*c83a76b0SSuyog Pawar UWORD8 *csbf,
1349*c83a76b0SSuyog Pawar WORD32 csbf_strd,
1350*c83a76b0SSuyog Pawar WORD32 *zero_col,
1351*c83a76b0SSuyog Pawar WORD32 *zero_row,
1352*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
1353*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
1354*c83a76b0SSuyog Pawar )
1355*c83a76b0SSuyog Pawar {
1356*c83a76b0SSuyog Pawar WORD32 i, j;
1357*c83a76b0SSuyog Pawar WORD32 log2_size;
1358*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
1359*c83a76b0SSuyog Pawar WORD32 cbf = 0;
1360*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
1361*c83a76b0SSuyog Pawar WORD16 i2_temp;
1362*c83a76b0SSuyog Pawar
1363*c83a76b0SSuyog Pawar (void)pi8_cost;
1364*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_0_1;
1365*c83a76b0SSuyog Pawar (void)pi4_quant_round_factor_1_2;
1366*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
1367*c83a76b0SSuyog Pawar
1368*c83a76b0SSuyog Pawar /* Quant initialization */
1369*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
1370*c83a76b0SSuyog Pawar log2_size -= 1;
1371*c83a76b0SSuyog Pawar
1372*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1373*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1374*c83a76b0SSuyog Pawar
1375*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1376*c83a76b0SSuyog Pawar {
1377*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1378*c83a76b0SSuyog Pawar {
1379*c83a76b0SSuyog Pawar WORD16 i2_temp1;
1380*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
1381*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1382*c83a76b0SSuyog Pawar
1383*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1384*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1385*c83a76b0SSuyog Pawar log2_size, q_add);
1386*c83a76b0SSuyog Pawar
1387*c83a76b0SSuyog Pawar i2_temp1 = pi2_q_dst[j];
1388*c83a76b0SSuyog Pawar
1389*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) > 1)
1390*c83a76b0SSuyog Pawar {
1391*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1392*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1393*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1394*c83a76b0SSuyog Pawar }
1395*c83a76b0SSuyog Pawar
1396*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1397*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1398*c83a76b0SSuyog Pawar
1399*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1400*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1401*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1402*c83a76b0SSuyog Pawar shift_iq,
1403*c83a76b0SSuyog Pawar qp_div);
1404*c83a76b0SSuyog Pawar }
1405*c83a76b0SSuyog Pawar
1406*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
1407*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
1408*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
1409*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
1410*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
1411*c83a76b0SSuyog Pawar }
1412*c83a76b0SSuyog Pawar
1413*c83a76b0SSuyog Pawar /* CSBF update */
1414*c83a76b0SSuyog Pawar {
1415*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
1416*c83a76b0SSuyog Pawar WORD32 row, col;
1417*c83a76b0SSuyog Pawar WORD16 *pi2_block;
1418*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
1419*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
1420*c83a76b0SSuyog Pawar
1421*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
1422*c83a76b0SSuyog Pawar
1423*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
1424*c83a76b0SSuyog Pawar {
1425*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
1426*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
1427*c83a76b0SSuyog Pawar {
1428*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1429*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
1430*c83a76b0SSuyog Pawar
1431*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
1432*c83a76b0SSuyog Pawar {
1433*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
1434*c83a76b0SSuyog Pawar {
1435*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
1436*c83a76b0SSuyog Pawar {
1437*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
1438*c83a76b0SSuyog Pawar break;
1439*c83a76b0SSuyog Pawar }
1440*c83a76b0SSuyog Pawar }
1441*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
1442*c83a76b0SSuyog Pawar {
1443*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
1444*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1445*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
1446*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
1447*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1448*c83a76b0SSuyog Pawar
1449*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
1450*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1451*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
1452*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
1453*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1454*c83a76b0SSuyog Pawar
1455*c83a76b0SSuyog Pawar break;
1456*c83a76b0SSuyog Pawar }
1457*c83a76b0SSuyog Pawar }
1458*c83a76b0SSuyog Pawar
1459*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
1460*c83a76b0SSuyog Pawar }
1461*c83a76b0SSuyog Pawar csbf += csbf_strd;
1462*c83a76b0SSuyog Pawar }
1463*c83a76b0SSuyog Pawar
1464*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
1465*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
1466*c83a76b0SSuyog Pawar }
1467*c83a76b0SSuyog Pawar
1468*c83a76b0SSuyog Pawar return cbf;
1469*c83a76b0SSuyog Pawar }
1470*c83a76b0SSuyog Pawar
1471*c83a76b0SSuyog Pawar
1472*c83a76b0SSuyog Pawar /**
1473*c83a76b0SSuyog Pawar *******************************************************************************
1474*c83a76b0SSuyog Pawar *
1475*c83a76b0SSuyog Pawar * @brief
1476*c83a76b0SSuyog Pawar * This function performs quantization, followed by Inverse
1477*c83a76b0SSuyog Pawar * quantization to find transform domain SSD
1478*c83a76b0SSuyog Pawar *
1479*c83a76b0SSuyog Pawar * @par Description:
1480*c83a76b0SSuyog Pawar * Performs quantization on coeffs
1481*c83a76b0SSuyog Pawar *
1482*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
1483*c83a76b0SSuyog Pawar * 4x4 Coeffs
1484*c83a76b0SSuyog Pawar *
1485*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
1486*c83a76b0SSuyog Pawar * Scaling Matrix
1487*c83a76b0SSuyog Pawar *
1488*c83a76b0SSuyog Pawar * @param[out] pi2_dst
1489*c83a76b0SSuyog Pawar * Output 4x4 coefficients
1490*c83a76b0SSuyog Pawar *
1491*c83a76b0SSuyog Pawar * @param[in] qp_div
1492*c83a76b0SSuyog Pawar * Quantization parameter / 6
1493*c83a76b0SSuyog Pawar *
1494*c83a76b0SSuyog Pawar * @param[in] qp_rem
1495*c83a76b0SSuyog Pawar * Quantization parameter % 6
1496*c83a76b0SSuyog Pawar *
1497*c83a76b0SSuyog Pawar * @param[in] src_strd
1498*c83a76b0SSuyog Pawar * Input stride
1499*c83a76b0SSuyog Pawar *
1500*c83a76b0SSuyog Pawar * @param[in] dst_strd
1501*c83a76b0SSuyog Pawar * Output Stride
1502*c83a76b0SSuyog Pawar *
1503*c83a76b0SSuyog Pawar * @param[out] csbf
1504*c83a76b0SSuyog Pawar * coded sub block flag
1505*c83a76b0SSuyog Pawar *
1506*c83a76b0SSuyog Pawar * @param[in] csbf_strd
1507*c83a76b0SSuyog Pawar * coded sub block flag
1508*c83a76b0SSuyog Pawar *
1509*c83a76b0SSuyog Pawar * @param[out] zero_col
1510*c83a76b0SSuyog Pawar * zero column flag
1511*c83a76b0SSuyog Pawar *
1512*c83a76b0SSuyog Pawar * @param[out] zero_row
 * zero row flag
1514*c83a76b0SSuyog Pawar *
1515*c83a76b0SSuyog Pawar * @returns cbf
1516*c83a76b0SSuyog Pawar * coded block flag
1517*c83a76b0SSuyog Pawar *
1518*c83a76b0SSuyog Pawar * @remarks
1519*c83a76b0SSuyog Pawar * None
1520*c83a76b0SSuyog Pawar *
1521*c83a76b0SSuyog Pawar *******************************************************************************
1522*c83a76b0SSuyog Pawar */
1523*c83a76b0SSuyog Pawar
ihevc_q_iq_ssd_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1524*c83a76b0SSuyog Pawar WORD32 ihevc_q_iq_ssd_var_rnd_fact
1525*c83a76b0SSuyog Pawar (
1526*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
1527*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
1528*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
1529*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
1530*c83a76b0SSuyog Pawar WORD32 trans_size,
1531*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
1532*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
1533*c83a76b0SSuyog Pawar WORD32 q_add,
1534*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
1535*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
1536*c83a76b0SSuyog Pawar WORD32 src_strd,
1537*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
1538*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
1539*c83a76b0SSuyog Pawar UWORD8 *csbf,
1540*c83a76b0SSuyog Pawar WORD32 csbf_strd,
1541*c83a76b0SSuyog Pawar WORD32 *zero_col,
1542*c83a76b0SSuyog Pawar WORD32 *zero_row,
1543*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
1544*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
1545*c83a76b0SSuyog Pawar )
1546*c83a76b0SSuyog Pawar {
1547*c83a76b0SSuyog Pawar WORD32 i, j;
1548*c83a76b0SSuyog Pawar WORD32 log2_size;
1549*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
1550*c83a76b0SSuyog Pawar WORD32 cbf = 0;
1551*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
1552*c83a76b0SSuyog Pawar WORD32 val;
1553*c83a76b0SSuyog Pawar WORD16 i2_temp;
1554*c83a76b0SSuyog Pawar //WORD16 i2_temp_1;
1555*c83a76b0SSuyog Pawar /* Initialize cost to zero */
1556*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
1557*c83a76b0SSuyog Pawar
1558*c83a76b0SSuyog Pawar (void)q_add;
1559*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
1560*c83a76b0SSuyog Pawar
1561*c83a76b0SSuyog Pawar
1562*c83a76b0SSuyog Pawar /* Quant initialization */
1563*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
1564*c83a76b0SSuyog Pawar log2_size -= 1;
1565*c83a76b0SSuyog Pawar
1566*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1567*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1568*c83a76b0SSuyog Pawar
1569*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1570*c83a76b0SSuyog Pawar {
1571*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1572*c83a76b0SSuyog Pawar {
1573*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
1574*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1575*c83a76b0SSuyog Pawar
1576*c83a76b0SSuyog Pawar
1577*c83a76b0SSuyog Pawar {
1578*c83a76b0SSuyog Pawar /* Quantization */
1579*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1580*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1581*c83a76b0SSuyog Pawar log2_size, 0);
1582*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) >= 2)
1583*c83a76b0SSuyog Pawar {
1584*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1585*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1586*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1587*c83a76b0SSuyog Pawar
1588*c83a76b0SSuyog Pawar }
1589*c83a76b0SSuyog Pawar else if (abs(pi2_q_dst[j]) >= 1)
1590*c83a76b0SSuyog Pawar {
1591*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1592*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1593*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_1_2);
1594*c83a76b0SSuyog Pawar }
1595*c83a76b0SSuyog Pawar
1596*c83a76b0SSuyog Pawar else
1597*c83a76b0SSuyog Pawar {
1598*c83a76b0SSuyog Pawar /* Quantization */
1599*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1600*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1601*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_0_1);
1602*c83a76b0SSuyog Pawar }
1603*c83a76b0SSuyog Pawar
1604*c83a76b0SSuyog Pawar }
1605*c83a76b0SSuyog Pawar
1606*c83a76b0SSuyog Pawar
1607*c83a76b0SSuyog Pawar
1608*c83a76b0SSuyog Pawar /* Inverse Quantization */
1609*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1610*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1611*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1612*c83a76b0SSuyog Pawar /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1613*c83a76b0SSuyog Pawar shift_iq,
1614*c83a76b0SSuyog Pawar qp_div);
1615*c83a76b0SSuyog Pawar
1616*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
1617*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
1618*c83a76b0SSuyog Pawar ssd_cost += val*val;
1619*c83a76b0SSuyog Pawar
1620*c83a76b0SSuyog Pawar pi4_quant_round_factor_0_1++;
1621*c83a76b0SSuyog Pawar pi4_quant_round_factor_1_2++;
1622*c83a76b0SSuyog Pawar }
1623*c83a76b0SSuyog Pawar
1624*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
1625*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
1626*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
1627*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
1628*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
1629*c83a76b0SSuyog Pawar }
1630*c83a76b0SSuyog Pawar /* Store the cost */
1631*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
1632*c83a76b0SSuyog Pawar
1633*c83a76b0SSuyog Pawar /* CSBF update */
1634*c83a76b0SSuyog Pawar {
1635*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
1636*c83a76b0SSuyog Pawar WORD32 row, col;
1637*c83a76b0SSuyog Pawar WORD16 *pi2_block;
1638*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
1639*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
1640*c83a76b0SSuyog Pawar
1641*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
1642*c83a76b0SSuyog Pawar
1643*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
1644*c83a76b0SSuyog Pawar {
1645*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
1646*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
1647*c83a76b0SSuyog Pawar {
1648*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1649*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
1650*c83a76b0SSuyog Pawar
1651*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
1652*c83a76b0SSuyog Pawar {
1653*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
1654*c83a76b0SSuyog Pawar {
1655*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
1656*c83a76b0SSuyog Pawar {
1657*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
1658*c83a76b0SSuyog Pawar break;
1659*c83a76b0SSuyog Pawar }
1660*c83a76b0SSuyog Pawar }
1661*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
1662*c83a76b0SSuyog Pawar {
1663*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
1664*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1665*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
1666*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
1667*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1668*c83a76b0SSuyog Pawar
1669*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
1670*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1671*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
1672*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
1673*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1674*c83a76b0SSuyog Pawar
1675*c83a76b0SSuyog Pawar break;
1676*c83a76b0SSuyog Pawar }
1677*c83a76b0SSuyog Pawar }
1678*c83a76b0SSuyog Pawar
1679*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
1680*c83a76b0SSuyog Pawar }
1681*c83a76b0SSuyog Pawar csbf += csbf_strd;
1682*c83a76b0SSuyog Pawar }
1683*c83a76b0SSuyog Pawar
1684*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
1685*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
1686*c83a76b0SSuyog Pawar }
1687*c83a76b0SSuyog Pawar
1688*c83a76b0SSuyog Pawar return cbf;
1689*c83a76b0SSuyog Pawar }
1690*c83a76b0SSuyog Pawar
ihevc_q_iq_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1691*c83a76b0SSuyog Pawar WORD32 ihevc_q_iq_var_rnd_fact
1692*c83a76b0SSuyog Pawar (
1693*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
1694*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
1695*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
1696*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
1697*c83a76b0SSuyog Pawar WORD32 trans_size,
1698*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
1699*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
1700*c83a76b0SSuyog Pawar WORD32 q_add,
1701*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
1702*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
1703*c83a76b0SSuyog Pawar WORD32 src_strd,
1704*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
1705*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
1706*c83a76b0SSuyog Pawar UWORD8 *csbf,
1707*c83a76b0SSuyog Pawar WORD32 csbf_strd,
1708*c83a76b0SSuyog Pawar WORD32 *zero_col,
1709*c83a76b0SSuyog Pawar WORD32 *zero_row,
1710*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
1711*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
1712*c83a76b0SSuyog Pawar )
1713*c83a76b0SSuyog Pawar {
1714*c83a76b0SSuyog Pawar WORD32 i, j;
1715*c83a76b0SSuyog Pawar WORD32 log2_size;
1716*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
1717*c83a76b0SSuyog Pawar WORD32 cbf = 0;
1718*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
1719*c83a76b0SSuyog Pawar WORD16 i2_temp;
1720*c83a76b0SSuyog Pawar
1721*c83a76b0SSuyog Pawar (void)q_add;
1722*c83a76b0SSuyog Pawar (void)pi8_cost;
1723*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
1724*c83a76b0SSuyog Pawar
1725*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
1726*c83a76b0SSuyog Pawar log2_size -= 1;
1727*c83a76b0SSuyog Pawar
1728*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1729*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1730*c83a76b0SSuyog Pawar
1731*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1732*c83a76b0SSuyog Pawar {
1733*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1734*c83a76b0SSuyog Pawar {
1735*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1736*c83a76b0SSuyog Pawar
1737*c83a76b0SSuyog Pawar {
1738*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1739*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1740*c83a76b0SSuyog Pawar log2_size, 0);
1741*c83a76b0SSuyog Pawar
1742*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) >= 2)
1743*c83a76b0SSuyog Pawar {
1744*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1745*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1746*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1747*c83a76b0SSuyog Pawar }
1748*c83a76b0SSuyog Pawar else if (abs(pi2_q_dst[j]) >= 1)
1749*c83a76b0SSuyog Pawar {
1750*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1751*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1752*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_1_2);
1753*c83a76b0SSuyog Pawar }
1754*c83a76b0SSuyog Pawar else
1755*c83a76b0SSuyog Pawar {
1756*c83a76b0SSuyog Pawar QUANT(pi2_q_dst[j],i2_temp,
1757*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1758*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_0_1);
1759*c83a76b0SSuyog Pawar }
1760*c83a76b0SSuyog Pawar }
1761*c83a76b0SSuyog Pawar
1762*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1763*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1764*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1765*c83a76b0SSuyog Pawar shift_iq,
1766*c83a76b0SSuyog Pawar qp_div);
1767*c83a76b0SSuyog Pawar
1768*c83a76b0SSuyog Pawar pi4_quant_round_factor_0_1++;
1769*c83a76b0SSuyog Pawar pi4_quant_round_factor_1_2++;
1770*c83a76b0SSuyog Pawar }
1771*c83a76b0SSuyog Pawar
1772*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
1773*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
1774*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
1775*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
1776*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
1777*c83a76b0SSuyog Pawar }
1778*c83a76b0SSuyog Pawar
1779*c83a76b0SSuyog Pawar /* CSBF update */
1780*c83a76b0SSuyog Pawar {
1781*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
1782*c83a76b0SSuyog Pawar WORD32 row, col;
1783*c83a76b0SSuyog Pawar WORD16 *pi2_block;
1784*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
1785*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
1786*c83a76b0SSuyog Pawar
1787*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
1788*c83a76b0SSuyog Pawar
1789*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
1790*c83a76b0SSuyog Pawar {
1791*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
1792*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
1793*c83a76b0SSuyog Pawar {
1794*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1795*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
1796*c83a76b0SSuyog Pawar
1797*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
1798*c83a76b0SSuyog Pawar {
1799*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
1800*c83a76b0SSuyog Pawar {
1801*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
1802*c83a76b0SSuyog Pawar {
1803*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
1804*c83a76b0SSuyog Pawar break;
1805*c83a76b0SSuyog Pawar }
1806*c83a76b0SSuyog Pawar }
1807*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
1808*c83a76b0SSuyog Pawar {
1809*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
1810*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1811*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
1812*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
1813*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1814*c83a76b0SSuyog Pawar
1815*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
1816*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1817*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
1818*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
1819*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
1820*c83a76b0SSuyog Pawar
1821*c83a76b0SSuyog Pawar break;
1822*c83a76b0SSuyog Pawar }
1823*c83a76b0SSuyog Pawar }
1824*c83a76b0SSuyog Pawar
1825*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
1826*c83a76b0SSuyog Pawar }
1827*c83a76b0SSuyog Pawar csbf += csbf_strd;
1828*c83a76b0SSuyog Pawar }
1829*c83a76b0SSuyog Pawar
1830*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
1831*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
1832*c83a76b0SSuyog Pawar }
1833*c83a76b0SSuyog Pawar
1834*c83a76b0SSuyog Pawar return cbf;
1835*c83a76b0SSuyog Pawar }
1836*c83a76b0SSuyog Pawar
1837*c83a76b0SSuyog Pawar /**
1838*c83a76b0SSuyog Pawar *******************************************************************************
1839*c83a76b0SSuyog Pawar *
1840*c83a76b0SSuyog Pawar * @brief
1841*c83a76b0SSuyog Pawar * This function performs quantization(using flat scale matrix), followed by
1842*c83a76b0SSuyog Pawar * inverse quantization to find transform domain SSD; when we perform RDOQ.
 * In case the quantized value turns out to be greater than 1, we then requantize
 * using half rounding.
1845*c83a76b0SSuyog Pawar *
1846*c83a76b0SSuyog Pawar * @par Description:
1847*c83a76b0SSuyog Pawar * Performs quantization on coeffs
1848*c83a76b0SSuyog Pawar *
1849*c83a76b0SSuyog Pawar * @param[in] pi2_coeffs
1850*c83a76b0SSuyog Pawar * 4x4 Coeffs
1851*c83a76b0SSuyog Pawar *
1852*c83a76b0SSuyog Pawar * @param[in] pi2_quant_coeff
1853*c83a76b0SSuyog Pawar * Scaling Matrix
1854*c83a76b0SSuyog Pawar *
1855*c83a76b0SSuyog Pawar * @param[out] pi2_dst
1856*c83a76b0SSuyog Pawar * Output 4x4 coefficients
1857*c83a76b0SSuyog Pawar *
1858*c83a76b0SSuyog Pawar * @param[in] qp_div
1859*c83a76b0SSuyog Pawar * Quantization parameter / 6
1860*c83a76b0SSuyog Pawar *
1861*c83a76b0SSuyog Pawar * @param[in] qp_rem
1862*c83a76b0SSuyog Pawar * Quantization parameter % 6
1863*c83a76b0SSuyog Pawar *
1864*c83a76b0SSuyog Pawar * @param[in] src_strd
1865*c83a76b0SSuyog Pawar * Input stride
1866*c83a76b0SSuyog Pawar *
1867*c83a76b0SSuyog Pawar * @param[in] dst_strd
1868*c83a76b0SSuyog Pawar * Output Stride
1869*c83a76b0SSuyog Pawar *
1870*c83a76b0SSuyog Pawar * @param[out] csbf
1871*c83a76b0SSuyog Pawar * coded sub block flag
1872*c83a76b0SSuyog Pawar *
1873*c83a76b0SSuyog Pawar * @param[in] csbf_strd
1874*c83a76b0SSuyog Pawar * coded sub block flag
1875*c83a76b0SSuyog Pawar *
1876*c83a76b0SSuyog Pawar * @param[out] zero_col
1877*c83a76b0SSuyog Pawar * zero column flag
1878*c83a76b0SSuyog Pawar *
1879*c83a76b0SSuyog Pawar * @param[out] zero_row
 * zero row flag
1881*c83a76b0SSuyog Pawar *
1882*c83a76b0SSuyog Pawar * @returns cbf
1883*c83a76b0SSuyog Pawar * coded block flag
1884*c83a76b0SSuyog Pawar *
1885*c83a76b0SSuyog Pawar * @remarks
1886*c83a76b0SSuyog Pawar * None
1887*c83a76b0SSuyog Pawar *
1888*c83a76b0SSuyog Pawar *******************************************************************************
1889*c83a76b0SSuyog Pawar */
1890*c83a76b0SSuyog Pawar
ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1891*c83a76b0SSuyog Pawar WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact
1892*c83a76b0SSuyog Pawar (
1893*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
1894*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
1895*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
1896*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
1897*c83a76b0SSuyog Pawar WORD32 trans_size,
1898*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
1899*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
1900*c83a76b0SSuyog Pawar WORD32 q_add,
1901*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
1902*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
1903*c83a76b0SSuyog Pawar WORD32 src_strd,
1904*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
1905*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
1906*c83a76b0SSuyog Pawar UWORD8 *csbf,
1907*c83a76b0SSuyog Pawar WORD32 csbf_strd,
1908*c83a76b0SSuyog Pawar WORD32 *zero_col,
1909*c83a76b0SSuyog Pawar WORD32 *zero_row,
1910*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
1911*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
1912*c83a76b0SSuyog Pawar )
1913*c83a76b0SSuyog Pawar {
1914*c83a76b0SSuyog Pawar WORD32 i, j;
1915*c83a76b0SSuyog Pawar WORD32 log2_size;
1916*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
1917*c83a76b0SSuyog Pawar WORD32 cbf = 0;
1918*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
1919*c83a76b0SSuyog Pawar WORD32 val;
1920*c83a76b0SSuyog Pawar WORD16 i2_temp;
1921*c83a76b0SSuyog Pawar /* Initialize cost to zero */
1922*c83a76b0SSuyog Pawar WORD32 ssd_cost = 0;
1923*c83a76b0SSuyog Pawar
1924*c83a76b0SSuyog Pawar (void)q_add;
1925*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
1926*c83a76b0SSuyog Pawar
1927*c83a76b0SSuyog Pawar /* Quant initialization */
1928*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
1929*c83a76b0SSuyog Pawar log2_size -= 1;
1930*c83a76b0SSuyog Pawar
1931*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
1932*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
1933*c83a76b0SSuyog Pawar
1934*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
1935*c83a76b0SSuyog Pawar {
1936*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
1937*c83a76b0SSuyog Pawar {
1938*c83a76b0SSuyog Pawar WORD16 i2_temp1;
1939*c83a76b0SSuyog Pawar /* Back up the coefficients before Quantization */
1940*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
1941*c83a76b0SSuyog Pawar
1942*c83a76b0SSuyog Pawar /*QUANT(pi2_dst[j], pi2_coeffs[j],
1943*c83a76b0SSuyog Pawar pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1944*c83a76b0SSuyog Pawar log2_size, q_add);*/
1945*c83a76b0SSuyog Pawar
1946*c83a76b0SSuyog Pawar /* modified by 1028 */
1947*c83a76b0SSuyog Pawar /* Quantization */
1948*c83a76b0SSuyog Pawar
1949*c83a76b0SSuyog Pawar
1950*c83a76b0SSuyog Pawar {
1951*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1952*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1953*c83a76b0SSuyog Pawar log2_size, 0);
1954*c83a76b0SSuyog Pawar
1955*c83a76b0SSuyog Pawar i2_temp1 = pi2_q_dst[j];
1956*c83a76b0SSuyog Pawar
1957*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) >= 2)
1958*c83a76b0SSuyog Pawar {
1959*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1960*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1961*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1962*c83a76b0SSuyog Pawar }
1963*c83a76b0SSuyog Pawar else if (abs(pi2_q_dst[j]) >= 1)
1964*c83a76b0SSuyog Pawar {
1965*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1966*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1967*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_1_2);
1968*c83a76b0SSuyog Pawar }
1969*c83a76b0SSuyog Pawar
1970*c83a76b0SSuyog Pawar else
1971*c83a76b0SSuyog Pawar {
1972*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1973*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
1974*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_0_1);
1975*c83a76b0SSuyog Pawar }
1976*c83a76b0SSuyog Pawar
1977*c83a76b0SSuyog Pawar }
1978*c83a76b0SSuyog Pawar
1979*c83a76b0SSuyog Pawar
1980*c83a76b0SSuyog Pawar
1981*c83a76b0SSuyog Pawar
1982*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1983*c83a76b0SSuyog Pawar
1984*c83a76b0SSuyog Pawar
1985*c83a76b0SSuyog Pawar /* Inverse Quantization */
1986*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
1987*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1988*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1989*c83a76b0SSuyog Pawar shift_iq,
1990*c83a76b0SSuyog Pawar qp_div);
1991*c83a76b0SSuyog Pawar
1992*c83a76b0SSuyog Pawar /* SSD Computation & Accumulation */
1993*c83a76b0SSuyog Pawar val = i2_temp - pi2_iq_dst[j];
1994*c83a76b0SSuyog Pawar ssd_cost += val*val;
1995*c83a76b0SSuyog Pawar
1996*c83a76b0SSuyog Pawar pi4_quant_round_factor_0_1++;
1997*c83a76b0SSuyog Pawar pi4_quant_round_factor_1_2++;
1998*c83a76b0SSuyog Pawar }
1999*c83a76b0SSuyog Pawar
2000*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
2001*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
2002*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
2003*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
2004*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
2005*c83a76b0SSuyog Pawar
2006*c83a76b0SSuyog Pawar }
2007*c83a76b0SSuyog Pawar /* Store the cost */
2008*c83a76b0SSuyog Pawar *pi8_cost = ssd_cost;
2009*c83a76b0SSuyog Pawar
2010*c83a76b0SSuyog Pawar /* CSBF update */
2011*c83a76b0SSuyog Pawar {
2012*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
2013*c83a76b0SSuyog Pawar WORD32 row, col;
2014*c83a76b0SSuyog Pawar WORD16 *pi2_block;
2015*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
2016*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
2017*c83a76b0SSuyog Pawar
2018*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
2019*c83a76b0SSuyog Pawar
2020*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
2021*c83a76b0SSuyog Pawar {
2022*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
2023*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
2024*c83a76b0SSuyog Pawar {
2025*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2026*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
2027*c83a76b0SSuyog Pawar
2028*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
2029*c83a76b0SSuyog Pawar {
2030*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
2031*c83a76b0SSuyog Pawar {
2032*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
2033*c83a76b0SSuyog Pawar {
2034*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
2035*c83a76b0SSuyog Pawar break;
2036*c83a76b0SSuyog Pawar }
2037*c83a76b0SSuyog Pawar }
2038*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
2039*c83a76b0SSuyog Pawar {
2040*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
2041*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2042*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
2043*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
2044*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
2045*c83a76b0SSuyog Pawar
2046*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
2047*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2048*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
2049*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
2050*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
2051*c83a76b0SSuyog Pawar
2052*c83a76b0SSuyog Pawar break;
2053*c83a76b0SSuyog Pawar }
2054*c83a76b0SSuyog Pawar }
2055*c83a76b0SSuyog Pawar
2056*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
2057*c83a76b0SSuyog Pawar }
2058*c83a76b0SSuyog Pawar csbf += csbf_strd;
2059*c83a76b0SSuyog Pawar }
2060*c83a76b0SSuyog Pawar
2061*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
2062*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
2063*c83a76b0SSuyog Pawar }
2064*c83a76b0SSuyog Pawar return cbf;
2065*c83a76b0SSuyog Pawar }
2066*c83a76b0SSuyog Pawar
ihevc_q_iq_flat_scale_mat_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)2067*c83a76b0SSuyog Pawar WORD32 ihevc_q_iq_flat_scale_mat_var_rnd_fact
2068*c83a76b0SSuyog Pawar (
2069*c83a76b0SSuyog Pawar WORD16 *pi2_coeffs,
2070*c83a76b0SSuyog Pawar WORD16 *pi2_quant_coeff,
2071*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst,
2072*c83a76b0SSuyog Pawar WORD16 *pi2_iq_dst,
2073*c83a76b0SSuyog Pawar WORD32 trans_size,
2074*c83a76b0SSuyog Pawar WORD32 qp_div,/* qpscaled / 6 */
2075*c83a76b0SSuyog Pawar WORD32 qp_rem,/* qpscaled % 6 */
2076*c83a76b0SSuyog Pawar WORD32 q_add,
2077*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_0_1,
2078*c83a76b0SSuyog Pawar WORD32 *pi4_quant_round_factor_1_2,
2079*c83a76b0SSuyog Pawar WORD32 src_strd,
2080*c83a76b0SSuyog Pawar WORD32 dst_q_strd,
2081*c83a76b0SSuyog Pawar WORD32 dst_iq_strd,
2082*c83a76b0SSuyog Pawar UWORD8 *csbf,
2083*c83a76b0SSuyog Pawar WORD32 csbf_strd,
2084*c83a76b0SSuyog Pawar WORD32 *zero_col,
2085*c83a76b0SSuyog Pawar WORD32 *zero_row,
2086*c83a76b0SSuyog Pawar WORD16 *pi2_dequant_coeff,
2087*c83a76b0SSuyog Pawar LWORD64 *pi8_cost
2088*c83a76b0SSuyog Pawar )
2089*c83a76b0SSuyog Pawar {
2090*c83a76b0SSuyog Pawar WORD32 i, j;
2091*c83a76b0SSuyog Pawar WORD32 log2_size;
2092*c83a76b0SSuyog Pawar WORD16 *pi2_q_dst_orig;
2093*c83a76b0SSuyog Pawar WORD32 cbf = 0;
2094*c83a76b0SSuyog Pawar WORD32 bit_depth,shift_iq;
2095*c83a76b0SSuyog Pawar WORD16 i2_temp;
2096*c83a76b0SSuyog Pawar
2097*c83a76b0SSuyog Pawar (void)q_add;
2098*c83a76b0SSuyog Pawar (void)pi8_cost;
2099*c83a76b0SSuyog Pawar pi2_q_dst_orig = pi2_q_dst;
2100*c83a76b0SSuyog Pawar
2101*c83a76b0SSuyog Pawar GETRANGE(log2_size, trans_size);
2102*c83a76b0SSuyog Pawar log2_size -= 1;
2103*c83a76b0SSuyog Pawar
2104*c83a76b0SSuyog Pawar bit_depth = 8 + 0;
2105*c83a76b0SSuyog Pawar shift_iq = bit_depth + log2_size - 5;
2106*c83a76b0SSuyog Pawar
2107*c83a76b0SSuyog Pawar for(i = 0; i < trans_size; i++)
2108*c83a76b0SSuyog Pawar {
2109*c83a76b0SSuyog Pawar for(j = 0; j < trans_size; j++)
2110*c83a76b0SSuyog Pawar {
2111*c83a76b0SSuyog Pawar WORD16 i2_temp1;
2112*c83a76b0SSuyog Pawar
2113*c83a76b0SSuyog Pawar i2_temp = pi2_coeffs[j];
2114*c83a76b0SSuyog Pawar
2115*c83a76b0SSuyog Pawar {
2116*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2117*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
2118*c83a76b0SSuyog Pawar log2_size, 0);
2119*c83a76b0SSuyog Pawar
2120*c83a76b0SSuyog Pawar i2_temp1 = pi2_q_dst[j];
2121*c83a76b0SSuyog Pawar
2122*c83a76b0SSuyog Pawar if (abs(pi2_q_dst[j]) >= 2)
2123*c83a76b0SSuyog Pawar {
2124*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
2125*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
2126*c83a76b0SSuyog Pawar log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
2127*c83a76b0SSuyog Pawar }
2128*c83a76b0SSuyog Pawar else if (abs(pi2_q_dst[j]) >= 1)
2129*c83a76b0SSuyog Pawar {
2130*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2131*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
2132*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_1_2);
2133*c83a76b0SSuyog Pawar }
2134*c83a76b0SSuyog Pawar else
2135*c83a76b0SSuyog Pawar {
2136*c83a76b0SSuyog Pawar QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2137*c83a76b0SSuyog Pawar g_ihevc_quant_scales[qp_rem], qp_div,
2138*c83a76b0SSuyog Pawar log2_size, *pi4_quant_round_factor_0_1);
2139*c83a76b0SSuyog Pawar }
2140*c83a76b0SSuyog Pawar }
2141*c83a76b0SSuyog Pawar
2142*c83a76b0SSuyog Pawar ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
2143*c83a76b0SSuyog Pawar
2144*c83a76b0SSuyog Pawar IQUANT(pi2_iq_dst[j],
2145*c83a76b0SSuyog Pawar pi2_q_dst[j], /*pi2_src[index*src_strd]*/
2146*c83a76b0SSuyog Pawar pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
2147*c83a76b0SSuyog Pawar shift_iq,
2148*c83a76b0SSuyog Pawar qp_div);
2149*c83a76b0SSuyog Pawar
2150*c83a76b0SSuyog Pawar pi4_quant_round_factor_0_1++;
2151*c83a76b0SSuyog Pawar pi4_quant_round_factor_1_2++;
2152*c83a76b0SSuyog Pawar }
2153*c83a76b0SSuyog Pawar
2154*c83a76b0SSuyog Pawar pi2_q_dst += dst_q_strd;
2155*c83a76b0SSuyog Pawar pi2_iq_dst += dst_iq_strd;
2156*c83a76b0SSuyog Pawar pi2_quant_coeff += trans_size;
2157*c83a76b0SSuyog Pawar pi2_coeffs += src_strd;
2158*c83a76b0SSuyog Pawar pi2_dequant_coeff += trans_size;
2159*c83a76b0SSuyog Pawar
2160*c83a76b0SSuyog Pawar }
2161*c83a76b0SSuyog Pawar
2162*c83a76b0SSuyog Pawar /* CSBF update */
2163*c83a76b0SSuyog Pawar {
2164*c83a76b0SSuyog Pawar WORD32 block_row, block_col;
2165*c83a76b0SSuyog Pawar WORD32 row, col;
2166*c83a76b0SSuyog Pawar WORD16 *pi2_block;
2167*c83a76b0SSuyog Pawar UWORD32 temp_zero_col = 0;
2168*c83a76b0SSuyog Pawar UWORD32 temp_zero_row = 0;
2169*c83a76b0SSuyog Pawar
2170*c83a76b0SSuyog Pawar pi2_q_dst = pi2_q_dst_orig;
2171*c83a76b0SSuyog Pawar
2172*c83a76b0SSuyog Pawar for(block_row = 0; block_row < trans_size; block_row += 4)
2173*c83a76b0SSuyog Pawar {
2174*c83a76b0SSuyog Pawar //block_col is incrementing by 1 for easy update of csbf pointer
2175*c83a76b0SSuyog Pawar for(block_col = 0; block_col < trans_size / 4; block_col++)
2176*c83a76b0SSuyog Pawar {
2177*c83a76b0SSuyog Pawar pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2178*c83a76b0SSuyog Pawar *(csbf + block_col) = 0;
2179*c83a76b0SSuyog Pawar
2180*c83a76b0SSuyog Pawar for(row = 0; row < 4; row++)
2181*c83a76b0SSuyog Pawar {
2182*c83a76b0SSuyog Pawar for(col = 0; col < 4; col++)
2183*c83a76b0SSuyog Pawar {
2184*c83a76b0SSuyog Pawar if(pi2_block[row * dst_q_strd + col] != 0)
2185*c83a76b0SSuyog Pawar {
2186*c83a76b0SSuyog Pawar *(csbf + block_col) = 1;
2187*c83a76b0SSuyog Pawar break;
2188*c83a76b0SSuyog Pawar }
2189*c83a76b0SSuyog Pawar }
2190*c83a76b0SSuyog Pawar if(*(csbf + block_col) == 1)
2191*c83a76b0SSuyog Pawar {
2192*c83a76b0SSuyog Pawar /* zero_col update *//* temp_zero_col = ~zero_col */
2193*c83a76b0SSuyog Pawar temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2194*c83a76b0SSuyog Pawar // zero col can be optimized further. Now clearing the
2195*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 colums of 4x4 block
2196*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
2197*c83a76b0SSuyog Pawar
2198*c83a76b0SSuyog Pawar /* zero row update */ /* temp_zero_row = ~zero_row */
2199*c83a76b0SSuyog Pawar temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2200*c83a76b0SSuyog Pawar // zero row can be optimized further. Now clearing the
2201*c83a76b0SSuyog Pawar // entire 4 bits corresponding to 4 rows of 4x4 block
2202*c83a76b0SSuyog Pawar // even if any 4x4 csbf is set
2203*c83a76b0SSuyog Pawar
2204*c83a76b0SSuyog Pawar break;
2205*c83a76b0SSuyog Pawar }
2206*c83a76b0SSuyog Pawar }
2207*c83a76b0SSuyog Pawar
2208*c83a76b0SSuyog Pawar cbf = cbf || (*(csbf + block_col)); // cbf update
2209*c83a76b0SSuyog Pawar }
2210*c83a76b0SSuyog Pawar csbf += csbf_strd;
2211*c83a76b0SSuyog Pawar }
2212*c83a76b0SSuyog Pawar
2213*c83a76b0SSuyog Pawar *zero_col = ~temp_zero_col; //final zero_col storing
2214*c83a76b0SSuyog Pawar *zero_row = ~temp_zero_row; //final zero_row storing
2215*c83a76b0SSuyog Pawar }
2216*c83a76b0SSuyog Pawar return cbf;
2217*c83a76b0SSuyog Pawar }
2218