xref: /aosp_15_r20/external/libhevc/decoder/ihevcd_iquant_itrans_recon_ctb.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevcd_iquant_itrans_recon_ctb.c
22  *
23  * @brief
24  *  Contains functions for inverse quantization, inverse transform and recon
25  *
26  * @author
27  *  Ittiam
28  *
29  * @par List of Functions:
30  * - ihevcd_iquant_itrans_recon_ctb()
31  *
32  * @remarks
33  *  None
34  *
35  *******************************************************************************
36  */
37 /*****************************************************************************/
38 /* File Includes                                                             */
39 /*****************************************************************************/
40 #include <stdio.h>
41 #include <stddef.h>
42 #include <stdlib.h>
43 #include <string.h>
44 
45 #include "ihevc_typedefs.h"
46 #include "iv.h"
47 #include "ivd.h"
48 #include "ihevcd_cxa.h"
49 
50 #include "ihevc_defs.h"
51 #include "ihevc_debug.h"
52 #include "ihevc_structs.h"
53 #include "ihevc_cabac_tables.h"
54 #include "ihevc_macros.h"
55 #include "ihevc_platform_macros.h"
56 
57 #include "ihevcd_defs.h"
58 #include "ihevcd_function_selector.h"
59 #include "ihevcd_structs.h"
60 #include "ihevcd_error.h"
61 #include "ihevcd_bitstream.h"
62 #include "ihevc_common_tables.h"
63 
64 /* Intra pred includes */
65 #include "ihevc_intra_pred.h"
66 
67 /* Inverse transform common module includes */
68 #include "ihevc_trans_tables.h"
69 #include "ihevc_trans_macros.h"
70 #include "ihevc_itrans_recon.h"
71 #include "ihevc_recon.h"
72 #include "ihevc_chroma_itrans_recon.h"
73 #include "ihevc_chroma_recon.h"
74 
75 /* Decoder includes */
76 #include "ihevcd_common_tables.h"
77 #include "ihevcd_iquant_itrans_recon_ctb.h"
78 #include "ihevcd_debug.h"
79 #include "ihevcd_profile.h"
80 #include "ihevcd_statistics.h"
81 #include "ihevcd_itrans_recon_dc.h"
82 
83 static const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
84 
85 
86 /* Globals */
87 static const WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
88   { IP_FUNC_MODE_0, /* Mode 0 */
89     IP_FUNC_MODE_1, /* Mode 1 */
90     IP_FUNC_MODE_2, /* Mode 2 */
91     IP_FUNC_MODE_3TO9, /* Mode 3 */
92     IP_FUNC_MODE_3TO9, /* Mode 4 */
93     IP_FUNC_MODE_3TO9, /* Mode 5 */
94     IP_FUNC_MODE_3TO9, /* Mode 6 */
95     IP_FUNC_MODE_3TO9, /* Mode 7 */
96     IP_FUNC_MODE_3TO9, /* Mode 8 */
97     IP_FUNC_MODE_3TO9, /* Mode 9 */
98     IP_FUNC_MODE_10, /* Mode 10 */
99     IP_FUNC_MODE_11TO17, /* Mode 11 */
100     IP_FUNC_MODE_11TO17, /* Mode 12 */
101     IP_FUNC_MODE_11TO17, /* Mode 13 */
102     IP_FUNC_MODE_11TO17, /* Mode 14 */
103     IP_FUNC_MODE_11TO17, /* Mode 15 */
104     IP_FUNC_MODE_11TO17, /* Mode 16 */
105     IP_FUNC_MODE_11TO17, /* Mode 17 */
106     IP_FUNC_MODE_18_34, /* Mode 18 */
107     IP_FUNC_MODE_19TO25, /* Mode 19 */
108     IP_FUNC_MODE_19TO25, /* Mode 20 */
109     IP_FUNC_MODE_19TO25, /* Mode 21 */
110     IP_FUNC_MODE_19TO25, /* Mode 22 */
111     IP_FUNC_MODE_19TO25, /* Mode 23 */
112     IP_FUNC_MODE_19TO25, /* Mode 24 */
113     IP_FUNC_MODE_19TO25, /* Mode 25 */
114     IP_FUNC_MODE_26, /* Mode 26 */
115     IP_FUNC_MODE_27TO33, /* Mode 27 */
116     IP_FUNC_MODE_27TO33, /* Mode 26 */
117     IP_FUNC_MODE_27TO33, /* Mode 29 */
118     IP_FUNC_MODE_27TO33, /* Mode 30 */
119     IP_FUNC_MODE_27TO33, /* Mode 31 */
120     IP_FUNC_MODE_27TO33, /* Mode 32 */
121     IP_FUNC_MODE_27TO33, /* Mode 33 */
122     IP_FUNC_MODE_18_34, /* Mode 34 */
123 };
124 
125 
126 const WORD16 *g_ai2_ihevc_trans_tables[] =
127   { &g_ai2_ihevc_trans_dst_4[0][0],
128     &g_ai2_ihevc_trans_4[0][0],
129     &g_ai2_ihevc_trans_8[0][0],
130     &g_ai2_ihevc_trans_16[0][0],
131     &g_ai2_ihevc_trans_32[0][0]
132 };
133 
134 
135 /*****************************************************************************/
136 /* Function Prototypes                                                       */
137 /*****************************************************************************/
138 /* Returns number of ai2_level read from ps_sblk_coeff */
ihevcd_unpack_coeffs(WORD16 * pi2_tu_coeff,WORD32 log2_trans_size,UWORD8 * pu1_tu_coeff_data,WORD16 * pi2_dequant_matrix,WORD32 qp_rem,WORD32 qp_div,TRANSFORM_TYPE e_trans_type,WORD32 trans_quant_bypass,UWORD32 * pu4_zero_cols,UWORD32 * pu4_zero_rows,UWORD32 * pu4_coeff_type,WORD16 * pi2_coeff_value)139 UWORD8* ihevcd_unpack_coeffs(WORD16 *pi2_tu_coeff,
140                              WORD32 log2_trans_size,
141                              UWORD8 *pu1_tu_coeff_data,
142                              WORD16 *pi2_dequant_matrix,
143                              WORD32 qp_rem,
144                              WORD32 qp_div,
145                              TRANSFORM_TYPE e_trans_type,
146                              WORD32 trans_quant_bypass,
147                              UWORD32 *pu4_zero_cols,
148                              UWORD32 *pu4_zero_rows,
149                              UWORD32 *pu4_coeff_type,
150                              WORD16 *pi2_coeff_value)
151 {
152     /* Generating coeffs from coeff-map */
153     WORD32 i;
154     WORD16 *pi2_sblk_ptr;
155     WORD32 subblk_pos_x, subblk_pos_y;
156     WORD32 sblk_scan_idx, coeff_raster_idx;
157     WORD32 sblk_non_zero_coeff_idx;
158     tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
159     UWORD8 u1_num_coded_sblks, u1_scan_type;
160     UWORD8 *pu1_new_tu_coeff_data;
161     WORD32 trans_size;
162     WORD32 xs, ys;
163     WORD32 trans_skip;
164     WORD16 iquant_out;
165     WORD32 shift_iq;
166     {
167         WORD32 bit_depth;
168 
169         bit_depth = 8 + 0;
170         shift_iq = bit_depth + log2_trans_size - 5;
171     }
172     trans_size = (1 << log2_trans_size);
173 
174     /* First byte points to number of coded blocks */
175     u1_num_coded_sblks = *pu1_tu_coeff_data++;
176 
177     /* Next byte points to scan type */
178     u1_scan_type = *pu1_tu_coeff_data++;
179     /* 0th bit has trans_skip */
180     trans_skip = u1_scan_type & 1;
181     u1_scan_type >>= 1;
182 
183     pi2_sblk_ptr = pi2_tu_coeff;
184 
185     /* Initially all columns are assumed to be zero */
186     *pu4_zero_cols = 0xFFFFFFFF;
187     /* Initially all rows are assumed to be zero */
188     *pu4_zero_rows = 0xFFFFFFFF;
189 
190     ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)(pu1_tu_coeff_data);
191 
192     if(trans_skip)
193         memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
194 
195     STATS_INIT_SBLK_AND_COEFF_POS();
196 
197     /* DC only case */
198     if((e_trans_type != DST_4x4) && (1 == u1_num_coded_sblks)
199                     && (0 == ps_tu_sblk_coeff_data->u2_subblk_pos)
200                     && (1 == ps_tu_sblk_coeff_data->u2_sig_coeff_map))
201     {
202         *pu4_coeff_type = 1;
203 
204         if(!trans_quant_bypass)
205         {
206             if(4 == trans_size)
207             {
208                 IQUANT_4x4(iquant_out,
209                            ps_tu_sblk_coeff_data->ai2_level[0],
210                            pi2_dequant_matrix[0]
211                                            * g_ihevc_iquant_scales[qp_rem],
212                            shift_iq, qp_div);
213             }
214             else
215             {
216                 IQUANT(iquant_out, ps_tu_sblk_coeff_data->ai2_level[0],
217                        pi2_dequant_matrix[0] * g_ihevc_iquant_scales[qp_rem],
218                        shift_iq, qp_div);
219             }
220             if(trans_skip)
221                 iquant_out = (iquant_out + 16) >> 5;
222         }
223         else
224         {
225             /* setting the column to zero */
226             for(i = 0; i < trans_size; i++)
227                 *(pi2_tu_coeff + i * trans_size) = 0;
228 
229             iquant_out = ps_tu_sblk_coeff_data->ai2_level[0];
230         }
231         *pi2_coeff_value = iquant_out;
232         *pi2_tu_coeff = iquant_out;
233         *pu4_zero_cols &= ~0x1;
234         *pu4_zero_rows &= ~0x1;
235         ps_tu_sblk_coeff_data =
236                         (void *)&ps_tu_sblk_coeff_data->ai2_level[1];
237 
238         STATS_UPDATE_COEFF_COUNT();
239         STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass),  0, 0);
240         STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
241         return ((UWORD8 *)ps_tu_sblk_coeff_data);
242     }
243     else
244     {
245         *pu4_coeff_type = 0;
246         /* In case of trans skip, memset has already happened */
247         if(!trans_skip)
248             memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
249     }
250 
251     for(i = 0; i < u1_num_coded_sblks; i++)
252     {
253         UWORD32 u4_sig_coeff_map;
254         subblk_pos_x = ps_tu_sblk_coeff_data->u2_subblk_pos & 0x00FF;
255         subblk_pos_y = (ps_tu_sblk_coeff_data->u2_subblk_pos & 0xFF00) >> 8;
256 
257         STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), subblk_pos_x, subblk_pos_y);
258 
259         subblk_pos_x = subblk_pos_x * MIN_TU_SIZE;
260         subblk_pos_y = subblk_pos_y * MIN_TU_SIZE;
261 
262         pi2_sblk_ptr = pi2_tu_coeff + subblk_pos_y * trans_size
263                         + subblk_pos_x;
264 
265         //*pu4_zero_cols &= ~(0xF << subblk_pos_x);
266 
267         sblk_non_zero_coeff_idx = 0;
268         u4_sig_coeff_map = ps_tu_sblk_coeff_data->u2_sig_coeff_map;
269         //for(sblk_scan_idx = (31 - CLZ(u4_sig_coeff_map)); sblk_scan_idx >= 0; sblk_scan_idx--)
270         sblk_scan_idx = 31;
271         do
272         {
273             WORD32 clz = CLZ(u4_sig_coeff_map);
274 
275             sblk_scan_idx -= clz;
276             /* when clz is 31, u4_sig_coeff_map << (clz+1) might result in unknown behaviour in some cases */
277             /* Hence either use SHL which takes care of handling these issues based on platform or shift in two stages */
278             u4_sig_coeff_map = u4_sig_coeff_map << clz;
279             /* Copying coeffs and storing in reverse order */
280             {
281                 STATS_UPDATE_COEFF_COUNT();
282                 coeff_raster_idx =
283                                 gau1_ihevc_invscan4x4[u1_scan_type][sblk_scan_idx];
284 
285                 xs = coeff_raster_idx & 0x3;
286                 ys = coeff_raster_idx >> 2;
287 
288                 if(!trans_quant_bypass)
289                 {
290                     if(4 == trans_size)
291                     {
292                         IQUANT_4x4(iquant_out,
293                                    ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
294                                    pi2_dequant_matrix[(subblk_pos_x + xs)
295                                                    + (subblk_pos_y + ys)
296                                                    * trans_size]
297                                    * g_ihevc_iquant_scales[qp_rem],
298                                    shift_iq, qp_div);
299                         sblk_non_zero_coeff_idx++;
300                     }
301                     else
302                     {
303                         IQUANT(iquant_out,
304                                ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
305                                pi2_dequant_matrix[(subblk_pos_x + xs)
306                                                + (subblk_pos_y + ys)
307                                                * trans_size]
308                                * g_ihevc_iquant_scales[qp_rem],
309                                shift_iq, qp_div);
310                         sblk_non_zero_coeff_idx++;
311                     }
312 
313                     if(trans_skip)
314                         iquant_out = (iquant_out + 16) >> 5;
315                 }
316                 else
317                 {
318                     iquant_out = ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx++];
319                 }
320                 *pu4_zero_cols &= ~(0x1 << (subblk_pos_x + xs));
321                 *pu4_zero_rows &= ~(0x1 << (subblk_pos_y + ys));
322                 *(pi2_sblk_ptr + xs + ys * trans_size) = iquant_out;
323             }
324             sblk_scan_idx--;
325             u4_sig_coeff_map <<= 1;
326 
327         }while(u4_sig_coeff_map);
328         /* Updating the sblk pointer */
329         ps_tu_sblk_coeff_data =
330                         (void *)&ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx];
331     }
332 
333     STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
334 
335     pu1_new_tu_coeff_data = (UWORD8 *)ps_tu_sblk_coeff_data;
336 
337     return pu1_new_tu_coeff_data;
338 }
339 
/* Reads the byte of the picture intra flag map that covers luma position  */
/* (x_nbr, y_nbr) and shifts it so that the flag of that position ends up  */
/* in bit 0. As addressed here, the map holds one bit per 8x8 block, one   */
/* byte per 64 luma columns and numbytes_row bytes per 8 luma rows         */
static UWORD8 ihevcd_intra_flags_along_row(UWORD8 *pu1_pic_intra_flag,
                                           WORD32 x_nbr,
                                           WORD32 y_nbr,
                                           WORD32 numbytes_row)
{
    UWORD8 *pu1_flag_byte = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
                    + x_nbr / 64;
    WORD32 bit_pos = (x_nbr / 8) % 8;

    return (UWORD8)(*pu1_flag_byte >> bit_pos);
}

/* Gathers the intra flags of num_blks vertically consecutive 8x8 blocks,  */
/* starting with the one covering (x_nbr, y_nbr); the flag of block k is   */
/* returned in bit k                                                       */
static UWORD8 ihevcd_intra_flags_along_col(UWORD8 *pu1_pic_intra_flag,
                                           WORD32 x_nbr,
                                           WORD32 y_nbr,
                                           WORD32 numbytes_row,
                                           WORD32 num_blks)
{
    UWORD8 *pu1_flag_byte = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
                    + x_nbr / 64;
    WORD32 bit_pos = (x_nbr / 8) % 8;
    UWORD8 u1_flags = 0;
    WORD32 blk;

    for(blk = 0; blk < num_blks; blk++)
    {
        u1_flags |= ((pu1_flag_byte[blk * numbytes_row] >> bit_pos) & 1) << blk;
    }

    return u1_flags;
}

/**
 *******************************************************************************
 *
 * @brief
 *  Builds the neighbour availability flags used for intra prediction of a TU
 *
 * @par Description:
 *  Availability of the five neighbour groups is first looked up in
 *  pu4_intra_nbr_avail. In that map, word (1 + row) describes a row of min TUs
 *  of the CTB (word 0 being the row above it), and bit (31 - (1 + col))
 *  describes a column (bit 31 being the column to the left of the CTB).
 *  With constrained intra prediction the available groups are further
 *  replaced by the per 8x8 intra flags read from ps_proc->pu1_pic_intra_flag;
 *  otherwise an available group has all its bits set. Top-right and
 *  bottom-left are finally masked by the number of blocks remaining inside
 *  the picture.
 *
 * @param[in] ps_proc
 *  Process context; CTB position, SPS and picture intra flag map are read
 *
 * @param[in] ps_tu
 *  TU whose position (in min TU units) and size are read
 *
 * @param[in] pu4_intra_nbr_avail
 *  Availability map of the current CTB, laid out as described above
 *
 * @param[in] i2_pic_width_in_luma_samples
 *  Picture width, used to derive the row size of the intra flag map
 *
 * @param[in] i1_constrained_intra_pred_flag
 *  Non-zero when only intra coded neighbours may be used
 *
 * @param[in] trans_size
 *  Transform size in luma samples
 *
 * @param[in] ctb_size
 *  CTB size in luma samples
 *
 * @returns
 *  Flags packed MSB to LSB as
 *  Top-Left (1 bit, bit 16) | Top-Right (4) | Top (4) | Left (4) |
 *  Bottom-Left (4), with Left and Bottom-Left stored bit-reversed
 *
 *******************************************************************************
 */
WORD32 ihevcd_get_intra_nbr_flag(process_ctxt_t *ps_proc,
                                 tu_t *ps_tu,
                                 UWORD32 *pu4_intra_nbr_avail,
                                 WORD16 i2_pic_width_in_luma_samples,
                                 UWORD8 i1_constrained_intra_pred_flag,
                                 WORD32 trans_size,
                                 WORD32 ctb_size)
{
    sps_t *ps_sps = ps_proc->ps_sps;
    UWORD8 *pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
    WORD32 numbytes_row = (i2_pic_width_in_luma_samples + 63) / 64;

    /* TU position inside the CTB and TU width, all in min TU units */
    WORD32 cur_x = ps_tu->b4_pos_x;
    WORD32 cur_y = ps_tu->b4_pos_y;
    WORD32 tu_span = trans_size / MIN_TU_SIZE;

    /* Availability words of the rows above, at and below the TU */
    UWORD32 u4_avail_above = pu4_intra_nbr_avail[cur_y];
    UWORD32 u4_avail_cur = pu4_intra_nbr_avail[cur_y + 1];
    UWORD32 u4_avail_below = pu4_intra_nbr_avail[cur_y + 1 + tu_span];

    /* Column immediately left of the TU sits at bit (31 - cur_x), the TU's */
    /* own first column one bit lower                                       */
    UWORD8 u1_bot_lt_avail = (u4_avail_below >> (31 - cur_x)) & 1;
    UWORD8 u1_left_avail = (u4_avail_cur >> (31 - cur_x)) & 1;
    UWORD8 u1_top_lt_avail = (u4_avail_above >> (31 - cur_x)) & 1;
    UWORD8 u1_top_avail = (u4_avail_above >> (30 - cur_x)) & 1;
    UWORD8 u1_top_rt_avail = (u4_avail_above >> (30 - cur_x - tu_span)) & 1;

    /* TU position in the picture, in luma samples */
    WORD32 x_cur = ps_proc->i4_ctb_x * ctb_size + cur_x * MIN_TU_SIZE;
    WORD32 y_cur = ps_proc->i4_ctb_y * ctb_size + cur_y * MIN_TU_SIZE;

    UWORD8 bot_left = 0;
    UWORD8 left = 0;
    UWORD8 top = 0;
    UWORD8 top_right = 0;
    UWORD8 top_left = 0;

    WORD32 num_8_blks, num_8_blks_in_bits;
    WORD32 nbr_flags;

    /* Number of 8x8 blocks along one TU edge (a 4x4 TU counts as one) */
    if(trans_size > 4)
        num_8_blks = trans_size / 8;
    else
        num_8_blks = 1;
    num_8_blks_in_bits = ((1 << num_8_blks) - 1);

    if(!i1_constrained_intra_pred_flag)
    {
        /* Every available neighbour group is usable as a whole */
        if(u1_bot_lt_avail)
            bot_left = 0xF;
        if(u1_left_avail)
            left = 0xF;
        if(u1_top_avail)
            top = 0xF;
        if(u1_top_rt_avail)
            top_right = 0xF;
        if(u1_top_lt_avail)
            top_left = 0x1;
    }
    else
    {
        /* TODO: constrained intra pred not tested */
        /* Only neighbours flagged as intra in the picture map are kept */
        if(u1_bot_lt_avail)
        {
            bot_left = ihevcd_intra_flags_along_col(pu1_pic_intra_flag,
                                                    x_cur - 1,
                                                    y_cur + trans_size,
                                                    numbytes_row,
                                                    num_8_blks);
            bot_left &= num_8_blks_in_bits;
        }
        if(u1_left_avail)
        {
            left = ihevcd_intra_flags_along_col(pu1_pic_intra_flag,
                                                x_cur - 1,
                                                y_cur,
                                                numbytes_row,
                                                num_8_blks);
            left &= num_8_blks_in_bits;
        }
        if(u1_top_avail)
        {
            top = ihevcd_intra_flags_along_row(pu1_pic_intra_flag,
                                               x_cur,
                                               y_cur - 1,
                                               numbytes_row);
            top &= num_8_blks_in_bits;
        }
        if(u1_top_rt_avail)
        {
            top_right = ihevcd_intra_flags_along_row(pu1_pic_intra_flag,
                                                     x_cur + trans_size,
                                                     y_cur - 1,
                                                     numbytes_row);
            top_right &= num_8_blks_in_bits;
        }
        if(u1_top_lt_avail)
        {
            top_left = ihevcd_intra_flags_along_row(pu1_pic_intra_flag,
                                                    x_cur - 1,
                                                    y_cur - 1,
                                                    numbytes_row) & 1;
        }
    }

    /* Handling incomplete CTBs: drop top-right / bottom-left blocks that */
    /* lie beyond the right / bottom picture edge                         */
    {
        WORD32 pu_size_limit = MIN(trans_size, 8);
        WORD32 cols_remaining, rows_remaining;
        WORD32 ctb_size_top, ctb_size_bot;
        WORD32 ctb_size_top_bits, ctb_size_bot_bits;

        /* Samples left in the picture to the right of this TU */
        cols_remaining = ps_sps->i2_pic_width_in_luma_samples
                        - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size)
                        - (ps_tu->b4_pos_x * MIN_TU_SIZE)
                        - (1 << (ps_tu->b3_size + 2));
        ctb_size_top = MIN(ctb_size, cols_remaining);
        ctb_size_top_bits = (1 << (ctb_size_top / pu_size_limit)) - 1;

        /* Samples left in the picture below this TU */
        rows_remaining = ps_sps->i2_pic_height_in_luma_samples
                        - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size)
                        - (ps_tu->b4_pos_y * MIN_TU_SIZE)
                        - (1 << (ps_tu->b3_size + 2));
        ctb_size_bot = MIN(ctb_size, rows_remaining);
        ctb_size_bot_bits = (1 << (ctb_size_bot / pu_size_limit)) - 1;

        top_right &= ctb_size_top_bits;
        bot_left &= ctb_size_bot_bits;
    }

    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
     *       1          4         4     4         4
     * Left and Bottom-Left go through the 4-bit reversal table, Top and
     * Top-Right are stored as gathered
     */
    nbr_flags = (top_left << 16)
                    | (top_right << 12)
                    | (top << 8)
                    | (gau4_ihevcd_4_bit_reverse[left] << 4)
                    | gau4_ihevcd_4_bit_reverse[bot_left];

    return nbr_flags;
}
529 
ihevcd_iquant_itrans_recon_ctb(process_ctxt_t * ps_proc)530 WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc)
531 {
532     WORD16 *pi2_scaling_mat;
533     UWORD8 *pu1_y_dst_ctb;
534     UWORD8 *pu1_uv_dst_ctb;
535     WORD32 ctb_size;
536     codec_t *ps_codec;
537     slice_header_t *ps_slice_hdr;
538     tu_t *ps_tu;
539     WORD16 *pi2_ctb_coeff;
540     WORD32 tu_cnt;
541     WORD16 *pi2_tu_coeff;
542     WORD16 *pi2_tmp;
543     WORD32 pic_strd;
544     WORD32 luma_nbr_flags;
545     WORD32 luma_nbr_flags_4x4[4] = { 0 };
546     WORD32 chroma_nbr_flags = 0;
547     UWORD8 u1_luma_pred_mode_first_tu = 0;
548     /* Pointers for generating 2d coeffs from coeff-map */
549     UWORD8 *pu1_tu_coeff_data;
550     /* nbr avail map for CTB */
551     /* 1st bit points to neighbor (left/top_left/bot_left) */
552     /* 1Tb starts at 2nd bit from msb of 2nd value in array, followed by number of min_tu's in that ctb */
553     UWORD32 au4_intra_nbr_avail[MAX_CTB_SIZE / MIN_TU_SIZE
554                     + 2 /* Top nbr + bot nbr */]; UWORD32
555                     top_avail_bits;
556     sps_t *ps_sps;
557     pps_t *ps_pps;
558     WORD32 intra_flag;
559     UWORD8 *pu1_pic_intra_flag;
560     /*************************************************************************/
561     /* Contanis scaling matrix offset in the following order in a 1D buffer  */
562     /* Entries that are listed as UNUSED are invalid combinations where      */
563     /* scaling matrix is not used. eg: 64x64 SKIP CU, 64x64 PCM CU           */
564     /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
565     /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
566     /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
567     /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
568     /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
569     /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
570     /* Intra 32x32 Y, UNUSED,  UNUSED                                        */
571     /* Inter 32x32 Y, UNUSED,  UNUSED                                        */
572     /* UNUSED,        UNUSED,  UNUSED                                        */
573     /* UNUSED,        UNUSED,  UNUSED                                        */
574     /*************************************************************************/
575     static const WORD32 scaling_mat_offset[] =
576       { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992,
577         1248, 1504, 1760, 2016, 0, 0, 3040, 0, 0, 0, 0, 0, 0, 0, 0};
578 
579     PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED();
580 
581     ps_sps = ps_proc->ps_sps;
582     ps_pps = ps_proc->ps_pps;
583     ps_slice_hdr = ps_proc->ps_slice_hdr;
584     ps_codec = ps_proc->ps_codec;
585 
586     pu1_y_dst_ctb = ps_proc->pu1_cur_ctb_luma;
587     pu1_uv_dst_ctb = ps_proc->pu1_cur_ctb_chroma;
588 
589     pi2_ctb_coeff = ps_proc->pi2_invscan_out;
590 
591     ctb_size = (1 << ps_sps->i1_log2_ctb_size);
592     pu1_tu_coeff_data = (UWORD8 *)ps_proc->pv_tu_coeff_data;
593 
594     pic_strd = ps_codec->i4_strd;
595 
596     pi2_tmp = ps_proc->pi2_itrans_intrmd_buf;
597 
598     pi2_tu_coeff = pi2_ctb_coeff;
599 
600     ps_tu = ps_proc->ps_tu;
601 
602     if((1 == ps_sps->i1_scaling_list_enable_flag) && (1 == ps_pps->i1_pps_scaling_list_data_present_flag))
603     {
604         pi2_scaling_mat = ps_pps->pi2_scaling_mat;
605     }
606     else
607     {
608         pi2_scaling_mat = ps_sps->pi2_scaling_mat;
609     }
610 
611     {
612         /* Updating the initial availability map */
613         WORD32 i;
614         UWORD8 u1_left_ctb_avail, u1_top_lt_ctb_avail, u1_top_rt_ctb_avail,
615                         u1_top_ctb_avail;
616 
617         u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
618         u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
619         u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
620         u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
621 
622         /* Initializing the availability array */
623         memset(au4_intra_nbr_avail, 0,
624                (MAX_CTB_SIZE / MIN_TU_SIZE + 2) * sizeof(UWORD32));
625         /* Initializing the availability array with CTB level availability flags */
626         {
627             WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
628             WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
629             for(i = 0; i < ctb_size_left / MIN_TU_SIZE; i++)
630             {
631                 au4_intra_nbr_avail[i + 1] = ((UWORD32)u1_left_ctb_avail << 31);
632             }
633         }
634         au4_intra_nbr_avail[0] |= (((UWORD32)u1_top_rt_ctb_avail << 31)
635                         >> (1 + ctb_size / MIN_TU_SIZE)); /* 1+ctb_size/4 position bit pos from msb */
636 
637         au4_intra_nbr_avail[0] |= ((UWORD32)u1_top_lt_ctb_avail << 31);
638 
639         {
640             WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
641             WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
642             WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
643 
644             /* ctb_size_top gives number of valid pixels remaining in the current row */
645             /* Since we need pattern of 1's starting from the MSB, an additional shift */
646             /* is needed */
647             shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
648 
649             top_avail_bits = ((1 << (ctb_size_top / MIN_TU_SIZE)) - 1)
650                             << shift;
651         }
652         au4_intra_nbr_avail[0] |= (
653                         (u1_top_ctb_avail == 1) ? top_avail_bits : 0x0);
654         /* Starting from msb 2nd bit to (1+ctb_size/4) bit, set 1 if top avail,or 0 */
655 
656     }
657 
658     /* Applying Inverse transform on all the TU's in CTB */
659     for(tu_cnt = 0; tu_cnt < ps_proc->i4_ctb_tu_cnt; tu_cnt++, ps_tu++)
660     {
661         WORD32 transform_skip_flag = 0;
662         WORD32 transform_skip_flag_v = 0;
663         WORD32 num_comp, c_idx, func_idx;
664         WORD32 src_strd, pred_strd, dst_strd;
665         WORD32 qp_div = 0, qp_rem = 0;
666         WORD32 qp_div_v = 0, qp_rem_v = 0;
667         UWORD32 zero_cols = 0, zero_cols_v = 0;
668         UWORD32 zero_rows = 0, zero_rows_v = 0;
669         UWORD32 coeff_type = 0, coeff_type_v = 0;
670         WORD16 i2_coeff_value, i2_coeff_value_v;
671         WORD32 trans_size = 0;
672         TRANSFORM_TYPE e_trans_type;
673         WORD32 log2_y_trans_size_minus_2, log2_uv_trans_size_minus_2;
674         WORD32 log2_trans_size;
675         WORD32 chroma_qp_idx;
676         WORD16 *pi2_src = NULL, *pi2_src_v = NULL;
677         UWORD8 *pu1_pred = NULL, *pu1_pred_v = NULL;
678         UWORD8 *pu1_dst = NULL, *pu1_dst_v = NULL;
679         WORD16 *pi2_dequant_matrix = NULL, *pi2_dequant_matrix_v = NULL;
680         WORD32 tu_x, tu_y;
681         WORD32 tu_y_offset, tu_uv_offset;
682         WORD8 i1_chroma_pic_qp_offset, i1_chroma_slice_qp_offset;
683         UWORD8 u1_cbf = 0, u1_cbf_v = 0, u1_luma_pred_mode, u1_chroma_pred_mode;
684         WORD32 offset;
685         WORD32 pcm_flag;
686         WORD32  chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
687         /* If 420SP_VU is chroma format, pred and dst pointer   */
688         /* will be added +1 to point to U                       */
689         WORD32 chroma_yuv420sp_vu_u_offset = 1 * chroma_yuv420sp_vu;
690         /* If 420SP_VU is chroma format, pred and dst pointer   */
691         /* will be added U offset of +1 and subtracted 2        */
692         /* to point to V                                        */
693         WORD32 chroma_yuv420sp_vu_v_offset = -2 * chroma_yuv420sp_vu;
694 
695         tu_x = ps_tu->b4_pos_x * 4; /* Converting minTU unit to pixel unit */
696         tu_y = ps_tu->b4_pos_y * 4; /* Converting minTU unit to pixel unit */
697         {
698             WORD32 tu_abs_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (tu_x);
699             WORD32 tu_abs_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (tu_y);
700 
701             WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
702 
703             pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
704             pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
705             pu1_pic_intra_flag += (tu_abs_x >> 6);
706 
707             intra_flag = *pu1_pic_intra_flag;
708             intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
709         }
710 
711         u1_luma_pred_mode = ps_tu->b6_luma_intra_mode;
712         u1_chroma_pred_mode = ps_tu->b3_chroma_intra_mode_idx;
713 
714         if(u1_chroma_pred_mode != 7)
715             num_comp = 2; /* Y and UV */
716         else
717             num_comp = 1; /* Y */
718 
719 
720         pcm_flag = 0;
721 
722         if((intra_flag) && (u1_luma_pred_mode == INTRA_PRED_NONE))
723         {
724             UWORD8 *pu1_buf;
725             UWORD8 *pu1_y_dst = pu1_y_dst_ctb;
726             UWORD8 *pu1_uv_dst = pu1_uv_dst_ctb;
727             WORD32 i, j;
728             tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
729             WORD32 cb_size = 1 << (ps_tu->b3_size + 2);
730 
731             /* trans_size is used to update availability after reconstruction */
732             trans_size = cb_size;
733 
734             pcm_flag = 1;
735 
736             tu_y_offset = tu_x + tu_y * pic_strd;
737             pu1_y_dst += tu_x + tu_y * pic_strd;
738             pu1_uv_dst += tu_x + (tu_y >> 1) * pic_strd;
739 
740             /* First byte points to number of coded blocks */
741             pu1_tu_coeff_data++;
742 
743             /* Next byte points to scan type */
744             pu1_tu_coeff_data++;
745 
746             ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)pu1_tu_coeff_data;
747 
748             pu1_buf = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
749             {
750 
751                 for(i = 0; i < cb_size; i++)
752                 {
753                     //pu1_y_dst[i * pic_strd + j] = *pu1_buf++;
754                     memcpy(&pu1_y_dst[i * pic_strd], pu1_buf, cb_size);
755                     pu1_buf += cb_size;
756                 }
757 
758                 pu1_uv_dst = pu1_uv_dst + chroma_yuv420sp_vu_u_offset;
759 
760                 /* U */
761                 for(i = 0; i < cb_size / 2; i++)
762                 {
763                     for(j = 0; j < cb_size / 2; j++)
764                     {
765                         pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
766                     }
767                 }
768 
769                 pu1_uv_dst = pu1_uv_dst + 1 + chroma_yuv420sp_vu_v_offset;
770 
771                 /* V */
772                 for(i = 0; i < cb_size / 2; i++)
773                 {
774                     for(j = 0; j < cb_size / 2; j++)
775                     {
776                         pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
777                     }
778                 }
779             }
780 
781             pu1_tu_coeff_data = pu1_buf;
782 
783         }
784 
785 
786 
787 
788 
789         for(c_idx = 0; c_idx < num_comp; c_idx++)
790         {
791             if(0 == pcm_flag)
792             {
793                 /* Initializing variables */
794                 pred_strd = pic_strd;
795                 dst_strd = pic_strd;
796 
797                 if(c_idx == 0) /* Y */
798                 {
799                     log2_y_trans_size_minus_2 = ps_tu->b3_size;
800                     trans_size = 1 << (log2_y_trans_size_minus_2 + 2);
801                     log2_trans_size = log2_y_trans_size_minus_2 + 2;
802 
803                     tu_y_offset = tu_x + tu_y * pic_strd;
804 
805                     pi2_src = pi2_tu_coeff;
806                     pu1_pred = pu1_y_dst_ctb + tu_y_offset;
807                     pu1_dst = pu1_y_dst_ctb + tu_y_offset;
808 
809                     /* Calculating scaling matrix offset */
810                     offset = log2_y_trans_size_minus_2 * 6
811                                     + (!intra_flag) * 3 + c_idx;
812                     pi2_dequant_matrix = pi2_scaling_mat
813                                     + scaling_mat_offset[offset];
814 
815                     src_strd = trans_size;
816 
817                     /* 4x4 transform Luma in INTRA mode is DST */
818                     if(log2_y_trans_size_minus_2 == 0 && intra_flag)
819                     {
820                         func_idx = log2_y_trans_size_minus_2;
821                         e_trans_type = DST_4x4;
822                     }
823                     else
824                     {
825                         func_idx = log2_y_trans_size_minus_2 + 1;
826                         e_trans_type = (TRANSFORM_TYPE)(log2_y_trans_size_minus_2 + 1);
827                     }
828 
829                     qp_div = ps_tu->b7_qp / 6;
830                     qp_rem = ps_tu->b7_qp % 6;
831 
832                     u1_cbf = ps_tu->b1_y_cbf;
833 
834                     transform_skip_flag = pu1_tu_coeff_data[1] & 1;
835                     /* Unpacking coeffs */
836                     if(1 == u1_cbf)
837                     {
838                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
839                                         pi2_src, log2_y_trans_size_minus_2 + 2,
840                                         pu1_tu_coeff_data, pi2_dequant_matrix,
841                                         qp_rem, qp_div, e_trans_type,
842                                         ps_tu->b1_transquant_bypass, &zero_cols,
843                                         &zero_rows, &coeff_type,
844                                         &i2_coeff_value);
845                     }
846                 }
847                 else /* UV interleaved */
848                 {
849                     /* Chroma :If Transform size is 4x4, keep 4x4 else do transform on (trans_size/2 x trans_size/2) */
850                     if(ps_tu->b3_size == 0)
851                     {
852                         /* Chroma 4x4 is present with 4th luma 4x4 block. For this case chroma position has to be (luma pos x - 4, luma pos y - 4) */
853                         log2_uv_trans_size_minus_2 = ps_tu->b3_size;
854                         tu_uv_offset = (tu_x - 4) + ((tu_y - 4) / 2) * pic_strd;
855                     }
856                     else
857                     {
858                         log2_uv_trans_size_minus_2 = ps_tu->b3_size - 1;
859                         tu_uv_offset = tu_x + (tu_y >> 1) * pic_strd;
860                     }
861                     trans_size = 1 << (log2_uv_trans_size_minus_2 + 2);
862                     log2_trans_size = log2_uv_trans_size_minus_2 + 2;
863 
864                     pi2_src = pi2_tu_coeff;
865                     pi2_src_v = pi2_tu_coeff + trans_size * trans_size;
866                     pu1_pred = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
867                     pu1_pred_v = pu1_pred + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
868                     pu1_dst = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
869                     pu1_dst_v = pu1_dst + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
870 
871                     /*TODO: Add support for choosing different tables for U and V,
872                      * change this to a single array to handle flat/default/custom, intra/inter, luma/chroma and various sizes
873                      */
874                     /* Calculating scaling matrix offset */
875                     /* ((log2_uv_trans_size_minus_2 == 3) ? 1:3) condition check is not needed, since
876                      * max uv trans size is 16x16
877                      */
878                     offset = log2_uv_trans_size_minus_2 * 6
879                                     + (!intra_flag) * 3 + c_idx;
880                     pi2_dequant_matrix = pi2_scaling_mat
881                                     + scaling_mat_offset[offset];
882                     pi2_dequant_matrix_v = pi2_scaling_mat
883                                     + scaling_mat_offset[offset + 1];
884 
885                     src_strd = trans_size;
886 
887                     func_idx = 1 + 4 + log2_uv_trans_size_minus_2; /* DST func + Y funcs + cur func index*/
888 
889                     /* Handle error cases where 64x64 TU is signalled which results in 32x32 chroma.
890                      * By limiting func_idx to 7, max of 16x16 chroma is called */
891                     func_idx = MIN(func_idx, 7);
892 
893                     e_trans_type = (TRANSFORM_TYPE)(log2_uv_trans_size_minus_2 + 1);
894                     /* QP for U */
895                     i1_chroma_pic_qp_offset = ps_pps->i1_pic_cb_qp_offset;
896                     i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cb_qp_offset;
897                     u1_cbf = ps_tu->b1_cb_cbf;
898 
899                     chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
900                                     + i1_chroma_slice_qp_offset;
901                     chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
902                     qp_div = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
903                     qp_rem = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
904 
905                     /* QP for V */
906                     i1_chroma_pic_qp_offset = ps_pps->i1_pic_cr_qp_offset;
907                     i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cr_qp_offset;
908                     u1_cbf_v = ps_tu->b1_cr_cbf;
909 
910                     chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
911                                     + i1_chroma_slice_qp_offset;
912                     chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
913                     qp_div_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
914                     qp_rem_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
915 
916                     /* Unpacking coeffs */
917                     transform_skip_flag = pu1_tu_coeff_data[1] & 1;
918                     if(1 == u1_cbf)
919                     {
920                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
921                                         pi2_src, log2_uv_trans_size_minus_2 + 2,
922                                         pu1_tu_coeff_data, pi2_dequant_matrix,
923                                         qp_rem, qp_div, e_trans_type,
924                                         ps_tu->b1_transquant_bypass, &zero_cols,
925                                         &zero_rows, &coeff_type,
926                                         &i2_coeff_value);
927                     }
928 
929                     transform_skip_flag_v = pu1_tu_coeff_data[1] & 1;
930                     if(1 == u1_cbf_v)
931                     {
932                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
933                                         pi2_src_v, log2_uv_trans_size_minus_2 + 2,
934                                         pu1_tu_coeff_data, pi2_dequant_matrix_v,
935                                         qp_rem_v, qp_div_v, e_trans_type,
936                                         ps_tu->b1_transquant_bypass, &zero_cols_v,
937                                         &zero_rows_v, &coeff_type_v, &i2_coeff_value_v);
938                     }
939                 }
940                 /***************************************************************/
941                 /******************  Intra Prediction **************************/
942                 /***************************************************************/
943                 if(intra_flag) /* Intra */
944                 {
945                     /* While (MAX_TU_SIZE * 2 * 2) + 1 is the actual size needed,
946                        au1_ref_sub_out size is kept as multiple of 8,
947                        so that SIMD functions can load 64 bits. Also some SIMD
948                        modules read few bytes before the start of the array, so
949                        allocate 16 extra bytes at the start */
950                     UWORD8 au1_ref_sub_out[16 + (MAX_TU_SIZE * 2 * 2) + 8] = {0};
951                     UWORD8 *pu1_ref_sub_out = &au1_ref_sub_out[16];
952                     UWORD8 *pu1_top_left, *pu1_top, *pu1_left;
953                     WORD32 luma_pred_func_idx, chroma_pred_func_idx;
954 
955                     /* Get the neighbour availability flags */
956                     /* Done for only Y */
957                     if(c_idx == 0)
958                     {
959                         /* Get neighbor availability for Y only */
960                         luma_nbr_flags = ihevcd_get_intra_nbr_flag(ps_proc,
961                                                                    ps_tu,
962                                                                    au4_intra_nbr_avail,
963                                                                    ps_sps->i2_pic_width_in_luma_samples,
964                                                                    ps_pps->i1_constrained_intra_pred_flag,
965                                                                    trans_size,
966                                                                    ctb_size);
967 
968                         if(trans_size == 4)
969                             luma_nbr_flags_4x4[(ps_tu->b4_pos_x % 2) + (ps_tu->b4_pos_y % 2) * 2] = luma_nbr_flags;
970 
971                         if((ps_tu->b4_pos_x % 2 == 0) && (ps_tu->b4_pos_y % 2 == 0))
972                         {
973                             chroma_nbr_flags = luma_nbr_flags;
974                         }
975 
976                         /* Initializing nbr pointers */
977                         pu1_top = pu1_pred - pic_strd;
978                         pu1_left = pu1_pred - 1;
979                         pu1_top_left = pu1_pred - pic_strd - 1;
980 
981                         /* call reference array substitution */
982                         if(luma_nbr_flags == 0x1ffff)
983                             ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr(
984                                             pu1_top_left,
985                                             pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, pu1_ref_sub_out, 1);
986                         else
987                             ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr(
988                                             pu1_top_left,
989                                             pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, pu1_ref_sub_out, 1);
990 
991                         /* call reference filtering */
992                         ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr(
993                                         pu1_ref_sub_out, trans_size,
994                                         pu1_ref_sub_out,
995                                         u1_luma_pred_mode, ps_sps->i1_strong_intra_smoothing_enable_flag);
996 
997                         /* use the look up to get the function idx */
998                         luma_pred_func_idx = g_i4_ip_funcs[u1_luma_pred_mode];
999 
1000                         /* call the intra prediction function */
1001                         ps_codec->apf_intra_pred_luma[luma_pred_func_idx](pu1_ref_sub_out, 1, pu1_pred, pred_strd, trans_size, u1_luma_pred_mode);
1002                     }
1003                     else
1004                     {
1005                         /* In case of yuv420sp_vu, prediction happens as usual.         */
1006                         /* So point the pu1_pred pointer to original prediction pointer */
1007                         UWORD8 *pu1_pred_orig = pu1_pred - chroma_yuv420sp_vu_u_offset;
1008 
1009                         /*    Top-Left | Top-Right | Top | Left | Bottom-Left
1010                          *      1         4         4     4         4
1011                          *
1012                          * Generating chroma_nbr_flags depending upon the transform size */
1013                         if(ps_tu->b3_size == 0)
1014                         {
1015                             /* Take TL,T,L flags of First luma 4x4 block */
1016                             chroma_nbr_flags = (luma_nbr_flags_4x4[0] & 0x10FF0);
1017                             /* Take TR flags of Second luma 4x4 block */
1018                             chroma_nbr_flags |= (luma_nbr_flags_4x4[1] & 0x0F000);
1019                             /* Take BL flags of Third luma 4x4 block */
1020                             chroma_nbr_flags |= (luma_nbr_flags_4x4[2] & 0x0000F);
1021                         }
1022 
1023                         /* Initializing nbr pointers */
1024                         pu1_top = pu1_pred_orig - pic_strd;
1025                         pu1_left = pu1_pred_orig - 2;
1026                         pu1_top_left = pu1_pred_orig - pic_strd - 2;
1027 
1028                         /* Chroma pred  mode derivation from luma pred mode */
1029                         {
1030                             tu_t *ps_tu_tmp = ps_tu;
1031                             while(!ps_tu_tmp->b1_first_tu_in_cu)
1032                             {
1033                                 ps_tu_tmp--;
1034                             }
1035                             u1_luma_pred_mode_first_tu = ps_tu_tmp->b6_luma_intra_mode;
1036                         }
1037                         if(4 == u1_chroma_pred_mode)
1038                             u1_chroma_pred_mode = u1_luma_pred_mode_first_tu;
1039                         else
1040                         {
1041                             u1_chroma_pred_mode = gau1_intra_pred_chroma_modes[u1_chroma_pred_mode];
1042 
1043                             if(u1_chroma_pred_mode ==
1044                                                             u1_luma_pred_mode_first_tu)
1045                             {
1046                                 u1_chroma_pred_mode = INTRA_ANGULAR(34);
1047                             }
1048                         }
1049 
1050                         /* call the chroma reference array substitution */
1051                         ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr(
1052                                         pu1_top_left,
1053                                         pu1_top, pu1_left, pic_strd, trans_size, chroma_nbr_flags, pu1_ref_sub_out, 1);
1054 
1055                         /* use the look up to get the function idx */
1056                         chroma_pred_func_idx =
1057                                         g_i4_ip_funcs[u1_chroma_pred_mode];
1058 
1059                         /* call the intra prediction function */
1060                         ps_codec->apf_intra_pred_chroma[chroma_pred_func_idx](pu1_ref_sub_out, 1, pu1_pred_orig, pred_strd, trans_size, u1_chroma_pred_mode);
1061                     }
1062                 }
1063 
1064                 /* Updating number of transform types */
1065                 STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx);
1066 
1067                 /* IQ, IT and Recon for Y if c_idx == 0, and U if c_idx !=0 */
1068                 if(1 == u1_cbf)
1069                 {
1070                     if(ps_tu->b1_transquant_bypass || transform_skip_flag)
1071                     {
1072                         /* Recon */
1073                         ps_codec->apf_recon[func_idx](pi2_src, pu1_pred, pu1_dst,
1074                                                       src_strd, pred_strd, dst_strd,
1075                                                       zero_cols);
1076                     }
1077                     else
1078                     {
1079 
1080                         /* Updating coded number of transform types(excluding trans skip and trans quant skip) */
1081                         STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
1082 
1083                         /* iQuant , iTrans and Recon */
1084                         if((0 == coeff_type))
1085                         {
1086                             ps_codec->apf_itrans_recon[func_idx](pi2_src, pi2_tmp,
1087                                                                  pu1_pred, pu1_dst,
1088                                                                  src_strd, pred_strd,
1089                                                                  dst_strd, zero_cols,
1090                                                                  zero_rows);
1091                         }
1092                         else /* DC only */
1093                         {
1094                             STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
1095                             ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred, pu1_dst,
1096                                                                  pred_strd, dst_strd,
1097                                                                  log2_trans_size,
1098                                                                  i2_coeff_value);
1099                         }
1100                     }
1101                 }
1102                 /* IQ, IT and Recon for V */
1103                 if(c_idx != 0)
1104                 {
1105                     if(1 == u1_cbf_v)
1106                     {
1107                         if(ps_tu->b1_transquant_bypass || transform_skip_flag_v)
1108                         {
1109                             /* Recon */
1110                             ps_codec->apf_recon[func_idx](pi2_src_v, pu1_pred_v,
1111                                                           pu1_dst_v, src_strd,
1112                                                           pred_strd, dst_strd,
1113                                                           zero_cols_v);
1114                         }
1115                         else
1116                         {
1117                             /* Updating number of transform types */
1118                             STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
1119 
1120                             /* iQuant , iTrans and Recon */
1121                             if((0 == coeff_type_v))
1122                             {
1123                                 ps_codec->apf_itrans_recon[func_idx](pi2_src_v,
1124                                                                      pi2_tmp,
1125                                                                      pu1_pred_v,
1126                                                                      pu1_dst_v,
1127                                                                      src_strd,
1128                                                                      pred_strd,
1129                                                                      dst_strd,
1130                                                                      zero_cols_v,
1131                                                                      zero_rows_v);
1132                             }
1133                             else  /* DC only */
1134                             {
1135                                 STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
1136                                 ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred_v, pu1_dst_v,
1137                                                                      pred_strd, dst_strd,
1138                                                                      log2_trans_size,
1139                                                                      i2_coeff_value_v);
1140                             }
1141                         }
1142                     }
1143                 }
1144             }
1145 
1146             /* Neighbor availability inside CTB */
1147             /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
1148             /* Used for neighbor availability in intra pred */
1149             if(c_idx == 0)
1150             {
1151                 WORD32 i;
1152                 WORD32 trans_in_min_tu;
1153                 UWORD32 cur_tu_in_bits;
1154                 UWORD32 cur_tu_avail_flag;
1155 
1156                 trans_in_min_tu = trans_size / MIN_TU_SIZE;
1157                 cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
1158                 cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
1159 
1160                 cur_tu_avail_flag = cur_tu_in_bits >> (ps_tu->b4_pos_x + 1);
1161 
1162                 for(i = 0; i < trans_in_min_tu; i++)
1163                     au4_intra_nbr_avail[1 + ps_tu->b4_pos_y + i] |=
1164                                     cur_tu_avail_flag;
1165             }
1166         }
1167     }
1168     ps_proc->pv_tu_coeff_data = pu1_tu_coeff_data;
1169 
1170     return ps_proc->i4_ctb_tu_cnt;
1171 }
1172 
1173