xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_me_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar  *
3*c83a76b0SSuyog Pawar  * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar  *
5*c83a76b0SSuyog Pawar  * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar  * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar  * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar  *
9*c83a76b0SSuyog Pawar  * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar  *
11*c83a76b0SSuyog Pawar  * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar  * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar  * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar  * limitations under the License.
16*c83a76b0SSuyog Pawar  *
17*c83a76b0SSuyog Pawar  *****************************************************************************
18*c83a76b0SSuyog Pawar  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar /**
21*c83a76b0SSuyog Pawar ******************************************************************************
22*c83a76b0SSuyog Pawar * @file
23*c83a76b0SSuyog Pawar *  ihevce_me_neon.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar *  Subpel refinement modules for ME algo
27*c83a76b0SSuyog Pawar *
28*c83a76b0SSuyog Pawar * @author
29*c83a76b0SSuyog Pawar *  Ittiam
30*c83a76b0SSuyog Pawar *
31*c83a76b0SSuyog Pawar * @par List of Functions:
32*c83a76b0SSuyog Pawar *
33*c83a76b0SSuyog Pawar * @remarks
34*c83a76b0SSuyog Pawar *  None
35*c83a76b0SSuyog Pawar *
36*c83a76b0SSuyog Pawar ********************************************************************************
37*c83a76b0SSuyog Pawar */
38*c83a76b0SSuyog Pawar 
39*c83a76b0SSuyog Pawar /*****************************************************************************/
40*c83a76b0SSuyog Pawar /* File Includes                                                             */
41*c83a76b0SSuyog Pawar /*****************************************************************************/
42*c83a76b0SSuyog Pawar /* System include files */
43*c83a76b0SSuyog Pawar #include <stdio.h>
44*c83a76b0SSuyog Pawar #include <string.h>
45*c83a76b0SSuyog Pawar #include <assert.h>
46*c83a76b0SSuyog Pawar #include <arm_neon.h>
47*c83a76b0SSuyog Pawar 
48*c83a76b0SSuyog Pawar /* User include files */
49*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
50*c83a76b0SSuyog Pawar #include "itt_video_api.h"
51*c83a76b0SSuyog Pawar #include "ihevc_cmn_utils_neon.h"
52*c83a76b0SSuyog Pawar #include "ihevc_chroma_itrans_recon.h"
53*c83a76b0SSuyog Pawar #include "ihevc_chroma_intra_pred.h"
54*c83a76b0SSuyog Pawar #include "ihevc_debug.h"
55*c83a76b0SSuyog Pawar #include "ihevc_deblk.h"
56*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
57*c83a76b0SSuyog Pawar #include "ihevc_itrans_recon.h"
58*c83a76b0SSuyog Pawar #include "ihevc_intra_pred.h"
59*c83a76b0SSuyog Pawar #include "ihevc_inter_pred.h"
60*c83a76b0SSuyog Pawar #include "ihevc_macros.h"
61*c83a76b0SSuyog Pawar #include "ihevc_mem_fns.h"
62*c83a76b0SSuyog Pawar #include "ihevc_padding.h"
63*c83a76b0SSuyog Pawar #include "ihevc_quant_iquant_ssd.h"
64*c83a76b0SSuyog Pawar #include "ihevc_resi_trans.h"
65*c83a76b0SSuyog Pawar #include "ihevc_sao.h"
66*c83a76b0SSuyog Pawar #include "ihevc_structs.h"
67*c83a76b0SSuyog Pawar #include "ihevc_weighted_pred.h"
68*c83a76b0SSuyog Pawar 
69*c83a76b0SSuyog Pawar #include "rc_cntrl_param.h"
70*c83a76b0SSuyog Pawar #include "rc_frame_info_collector.h"
71*c83a76b0SSuyog Pawar #include "rc_look_ahead_params.h"
72*c83a76b0SSuyog Pawar 
73*c83a76b0SSuyog Pawar #include "ihevce_api.h"
74*c83a76b0SSuyog Pawar #include "ihevce_defs.h"
75*c83a76b0SSuyog Pawar #include "ihevce_lap_enc_structs.h"
76*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_structs.h"
77*c83a76b0SSuyog Pawar #include "ihevce_function_selector.h"
78*c83a76b0SSuyog Pawar #include "ihevce_me_common_defs.h"
79*c83a76b0SSuyog Pawar #include "ihevce_enc_structs.h"
80*c83a76b0SSuyog Pawar #include "ihevce_had_satd.h"
81*c83a76b0SSuyog Pawar #include "ihevce_ipe_instr_set_router.h"
82*c83a76b0SSuyog Pawar #include "ihevce_global_tables.h"
83*c83a76b0SSuyog Pawar 
84*c83a76b0SSuyog Pawar #include "hme_datatype.h"
85*c83a76b0SSuyog Pawar #include "hme_common_defs.h"
86*c83a76b0SSuyog Pawar #include "hme_common_utils.h"
87*c83a76b0SSuyog Pawar #include "hme_interface.h"
88*c83a76b0SSuyog Pawar #include "hme_defs.h"
89*c83a76b0SSuyog Pawar #include "hme_err_compute.h"
90*c83a76b0SSuyog Pawar #include "hme_globals.h"
91*c83a76b0SSuyog Pawar 
92*c83a76b0SSuyog Pawar #include "ihevce_me_instr_set_router.h"
93*c83a76b0SSuyog Pawar 
94*c83a76b0SSuyog Pawar /*****************************************************************************/
95*c83a76b0SSuyog Pawar /* Typedefs                                                                  */
96*c83a76b0SSuyog Pawar /*****************************************************************************/
97*c83a76b0SSuyog Pawar typedef void ft_calc_sad4_nxn(
98*c83a76b0SSuyog Pawar     UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad);
99*c83a76b0SSuyog Pawar 
100*c83a76b0SSuyog Pawar /*****************************************************************************/
101*c83a76b0SSuyog Pawar /* Function Macros                                                           */
102*c83a76b0SSuyog Pawar /*****************************************************************************/
/*
 * COMBINE_SADS(pps, as, i)
 *
 * Derives the SAD of every partition shape from sixteen sub-block SADs and
 * stores the results in column (i) of a per-partition table.
 *
 *   pps : table indexed as pps[PART_ID_xxx][i]
 *   as  : 16 sub-block SADs laid out as a 4x4 raster
 *         (as[0..3] top row, as[12..15] bottom row)
 *   i   : column (candidate index) to write
 *
 * Notes:
 *  - Every argument is parenthesized at each use, so expressions such as
 *    `buf + 16` can be passed safely.
 *  - Arguments are evaluated many times; do not pass expressions with side
 *    effects.
 *  - The statement order matters: the symmetric and AMP "remainder" entries
 *    are computed from entries written earlier in the macro.
 *  - Expands to a brace-enclosed statement list; use it as a standalone
 *    statement.
 */
#define COMBINE_SADS(pps, as, i)                                                                   \
    {                                                                                              \
        (pps)[PART_ID_NxN_TL][(i)] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]);                      \
        (pps)[PART_ID_NxN_TR][(i)] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]);                      \
        (pps)[PART_ID_NxN_BL][(i)] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]);                    \
        (pps)[PART_ID_NxN_BR][(i)] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]);                  \
                                                                                                   \
        (pps)[PART_ID_Nx2N_L][(i)] = (pps)[PART_ID_NxN_TL][(i)] + (pps)[PART_ID_NxN_BL][(i)];      \
        (pps)[PART_ID_Nx2N_R][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_BR][(i)];      \
        (pps)[PART_ID_2NxN_T][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_TL][(i)];      \
        (pps)[PART_ID_2NxN_B][(i)] = (pps)[PART_ID_NxN_BR][(i)] + (pps)[PART_ID_NxN_BL][(i)];      \
                                                                                                   \
        (pps)[PART_ID_nLx2N_L][(i)] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]);                    \
        (pps)[PART_ID_nRx2N_R][(i)] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]);                   \
        (pps)[PART_ID_2NxnU_T][(i)] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]);                     \
        (pps)[PART_ID_2NxnD_B][(i)] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]);                 \
                                                                                                   \
        (pps)[PART_ID_2Nx2N][(i)] = (pps)[PART_ID_2NxN_T][(i)] + (pps)[PART_ID_2NxN_B][(i)];       \
                                                                                                   \
        (pps)[PART_ID_2NxnU_B][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnU_T][(i)];     \
        (pps)[PART_ID_2NxnD_T][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnD_B][(i)];     \
        (pps)[PART_ID_nRx2N_L][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nRx2N_R][(i)];     \
        (pps)[PART_ID_nLx2N_R][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nLx2N_L][(i)];     \
    }
127*c83a76b0SSuyog Pawar 
/*
 * COMBINE_SADS_2(ps, as)
 *
 * Single-column variant of COMBINE_SADS: derives the SAD of every partition
 * shape from sixteen sub-block SADs and stores them in a flat table.
 *
 *   ps : table indexed as ps[PART_ID_xxx]
 *   as : 16 sub-block SADs laid out as a 4x4 raster
 *        (as[0..3] top row, as[12..15] bottom row)
 *
 * Notes:
 *  - Every argument is parenthesized at each use, so expressions such as
 *    `buf + 16` can be passed safely.
 *  - Arguments are evaluated many times; do not pass expressions with side
 *    effects.
 *  - The statement order matters: later entries are computed from entries
 *    written earlier in the macro.
 *  - Expands to a brace-enclosed statement list; use it as a standalone
 *    statement.
 */
#define COMBINE_SADS_2(ps, as)                                                                     \
    {                                                                                              \
        (ps)[PART_ID_NxN_TL] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]);                            \
        (ps)[PART_ID_NxN_TR] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]);                            \
        (ps)[PART_ID_NxN_BL] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]);                          \
        (ps)[PART_ID_NxN_BR] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]);                        \
                                                                                                   \
        (ps)[PART_ID_Nx2N_L] = (ps)[PART_ID_NxN_TL] + (ps)[PART_ID_NxN_BL];                        \
        (ps)[PART_ID_Nx2N_R] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_BR];                        \
        (ps)[PART_ID_2NxN_T] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_TL];                        \
        (ps)[PART_ID_2NxN_B] = (ps)[PART_ID_NxN_BR] + (ps)[PART_ID_NxN_BL];                        \
                                                                                                   \
        (ps)[PART_ID_nLx2N_L] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]);                          \
        (ps)[PART_ID_nRx2N_R] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]);                         \
        (ps)[PART_ID_2NxnU_T] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]);                           \
        (ps)[PART_ID_2NxnD_B] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]);                       \
                                                                                                   \
        (ps)[PART_ID_2Nx2N] = (ps)[PART_ID_2NxN_T] + (ps)[PART_ID_2NxN_B];                         \
                                                                                                   \
        (ps)[PART_ID_2NxnU_B] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnU_T];                       \
        (ps)[PART_ID_2NxnD_T] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnD_B];                       \
        (ps)[PART_ID_nRx2N_L] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nRx2N_R];                       \
        (ps)[PART_ID_nLx2N_R] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nLx2N_L];                       \
    }
152*c83a76b0SSuyog Pawar 
153*c83a76b0SSuyog Pawar /*****************************************************************************/
154*c83a76b0SSuyog Pawar /* Function Definitions                                                      */
155*c83a76b0SSuyog Pawar /*****************************************************************************/
156*c83a76b0SSuyog Pawar 
/*
 * Computes the SADs of four horizontally adjacent 2x2 blocks (an 8x2 strip).
 *
 * Reads 8 bytes from each of two rows of pu1_src and pu1_pred (rows are
 * src_strd / pred_strd bytes apart) and writes four 32-bit SADs to pu4_sad,
 * left-most block first.
 */
static void ihevce_sad4_2x2_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* One 16-bit accumulator per pixel column of the strip */
    uint16x8_t col_sums = vdupq_n_u16(0);
    WORD32 rows_left = 2;

    while(rows_left-- > 0)
    {
        col_sums = vabal_u8(col_sums, vld1_u8(pu1_src), vld1_u8(pu1_pred));
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Adding adjacent column pairs yields one widened sum per 2x2 block */
    vst1q_u32(pu4_sad, vpaddlq_u16(col_sums));
}
177*c83a76b0SSuyog Pawar 
/*
 * Computes the SADs of four horizontally adjacent 4x4 blocks (a 16x4 strip).
 *
 * Reads 16 bytes from each of four rows of pu1_src and pu1_pred (rows are
 * src_strd / pred_strd bytes apart) and writes four 16-bit SADs to pu2_sad,
 * left-most block first.
 */
static void ihevce_sad4_4x4_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD16 *pu2_sad)
{
    /* Per-column sums: columns 0-7 (blocks 0,1) and columns 8-15 (blocks 2,3) */
    uint16x8_t cols_lo = vdupq_n_u16(0);
    uint16x8_t cols_hi = vdupq_n_u16(0);
    uint16x4_t pairs_lo, pairs_hi;
    WORD32 rows_left = 4;

    while(rows_left-- > 0)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred);

        cols_lo = vabal_u8(cols_lo, vget_low_u8(src_row), vget_low_u8(pred_row));
        cols_hi = vabal_u8(cols_hi, vget_high_u8(src_row), vget_high_u8(pred_row));
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* First pairwise add: two partial sums per block */
    pairs_lo = vpadd_u16(vget_low_u16(cols_lo), vget_high_u16(cols_lo));
    pairs_hi = vpadd_u16(vget_low_u16(cols_hi), vget_high_u16(cols_hi));

    /* Second pairwise add: one sum per block, in block order */
    vst1_u16(pu2_sad, vpadd_u16(pairs_lo, pairs_hi));
}
203*c83a76b0SSuyog Pawar 
/*
 * Computes the SADs of four horizontally adjacent 8x8 blocks (a 32x8 strip).
 *
 * Reads 32 bytes from each of eight rows of pu1_src and pu1_pred (rows are
 * src_strd / pred_strd bytes apart) and writes four 32-bit SADs to pu4_sad,
 * left-most block first.
 */
static void ihevce_sad4_8x8_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* One vector of eight per-column sums for each of the four blocks */
    uint16x8_t cols_blk0 = vdupq_n_u16(0);
    uint16x8_t cols_blk1 = vdupq_n_u16(0);
    uint16x8_t cols_blk2 = vdupq_n_u16(0);
    uint16x8_t cols_blk3 = vdupq_n_u16(0);
    uint16x4_t quad_blk0, quad_blk1, quad_blk2, quad_blk3;
    uint16x4_t pair_blk01, pair_blk23;
    WORD32 row;

    for(row = 8; row > 0; row--)
    {
        const uint8x16_t src_left = vld1q_u8(pu1_src);
        const uint8x16_t src_right = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_left = vld1q_u8(pu1_pred);
        const uint8x16_t pred_right = vld1q_u8(pu1_pred + 16);

        cols_blk0 = vabal_u8(cols_blk0, vget_low_u8(src_left), vget_low_u8(pred_left));
        cols_blk1 = vabal_u8(cols_blk1, vget_high_u8(src_left), vget_high_u8(pred_left));
        cols_blk2 = vabal_u8(cols_blk2, vget_low_u8(src_right), vget_low_u8(pred_right));
        cols_blk3 = vabal_u8(cols_blk3, vget_high_u8(src_right), vget_high_u8(pred_right));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* 8 column sums -> 4 partial sums per block */
    quad_blk0 = vpadd_u16(vget_low_u16(cols_blk0), vget_high_u16(cols_blk0));
    quad_blk1 = vpadd_u16(vget_low_u16(cols_blk1), vget_high_u16(cols_blk1));
    quad_blk2 = vpadd_u16(vget_low_u16(cols_blk2), vget_high_u16(cols_blk2));
    quad_blk3 = vpadd_u16(vget_low_u16(cols_blk3), vget_high_u16(cols_blk3));

    /* 4 partial sums -> 2 partial sums per block, two blocks per vector */
    pair_blk01 = vpadd_u16(quad_blk0, quad_blk1);
    pair_blk23 = vpadd_u16(quad_blk2, quad_blk3);

    /* Final widening pairwise add leaves one 32-bit SAD per block */
    vst1q_u32(pu4_sad, vpaddlq_u16(vcombine_u16(pair_blk01, pair_blk23)));
}
242*c83a76b0SSuyog Pawar 
/*
 * Computes the SADs of four horizontally adjacent 16x16 blocks (a 64x16
 * strip) by calling ihevce_nxn_sad_computer_neon() once per block with a
 * block size of 16. Results are written to pu4_sad[0..3], left-most block
 * first.
 */
static void ihevce_sad4_16x16_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    WORD32 blk;

    for(blk = 0; blk < 4; blk++)
    {
        /* Each block starts 16 pixels to the right of the previous one */
        UWORD8 *pu1_src_blk = pu1_src + (blk * 16);
        UWORD8 *pu1_pred_blk = pu1_pred + (blk * 16);

        pu4_sad[blk] =
            ihevce_nxn_sad_computer_neon(pu1_src_blk, src_strd, pu1_pred_blk, pred_strd, 16);
    }
}
255*c83a76b0SSuyog Pawar 
/*
 * Builds the candidate list described by a search grid and computes, for
 * every candidate, the SAD of each partition shape of the current block.
 *
 *   ps_grid        : grid description (centre pointers, centre MVs, per-grid
 *                    candidate masks, packed grid step, reference stride)
 *   pu1_cur_ptr    : top-left of the current block
 *   cur_buf_stride : stride of the current-block buffer
 *   pp_part_sads   : output table, written as pp_part_sads[PART_ID_xxx][cand]
 *   ps_cand        : output candidate array; up to NUM_CANDIDATES_IN_GRID
 *                    entries are appended per grid
 *   num_cands      : receives the number of candidates written
 *   e_cu_size      : block size; only CU_8x8, CU_32x32 and CU_64x64 have a
 *                    kernel here
 *
 * For an unsupported e_cu_size the function asserts in debug builds and
 * otherwise returns with *num_cands == 0, without touching ps_cand or
 * pp_part_sads. Previously this case called through a NULL function pointer.
 */
void compute_part_sads_for_MxM_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    WORD32 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands,
    CU_SIZE_T e_cu_size)
{
    /* Grid step is packed as (y << 16) | x */
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
    WORD32 offset_x[NUM_CANDIDATES_IN_GRID] = { 0,         -grd_sz_x, 0,         grd_sz_x, 0,
                                                -grd_sz_x, grd_sz_x,  -grd_sz_x, grd_sz_x };
    WORD32 offset_y[NUM_CANDIDATES_IN_GRID] = { 0,         0,         -grd_sz_y, 0,       grd_sz_y,
                                                -grd_sz_y, -grd_sz_y, grd_sz_y,  grd_sz_y };

    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
    WORD32 shift;
    WORD32 cur_buf_stride_lsN;
    WORD32 ref_buf_stride_lsN;

    cand_t *cand0 = ps_cand;

    ft_calc_sad4_nxn *calc_sad4 = NULL;

    /* for a 2Nx2N partition we evaluate (N/2)x(N/2) SADs. This is needed for
     * AMP cases */
    UWORD32 au4_nxn_sad[16];

    WORD32 i, j;

    *num_cands = 0;

    /* fn selector: each kernel handles one row of four quarter-size blocks */
    if(e_cu_size == CU_8x8)
        calc_sad4 = ihevce_sad4_2x2_neon;
    else if(e_cu_size == CU_32x32)
        calc_sad4 = ihevce_sad4_8x8_neon;
    else if(e_cu_size == CU_64x64)
        calc_sad4 = ihevce_sad4_16x16_neon;

    /* No kernel for this block size: report zero candidates instead of
     * calling through a NULL pointer */
    assert(calc_sad4 != NULL);
    if(calc_sad4 == NULL)
        return;

    /* Row step between two consecutive bands of sub-blocks. This relies on
     * the numeric value of the CU_SIZE_T enumerator. */
    shift = (WORD32)e_cu_size;
    cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
    ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));

    /* Loop to fill up the cand_t array and to calculate num_cands */
    for(i = 0; i < ps_grid->num_grids; i++)
    {
        WORD32 mask = ps_grid->pi4_grd_mask[i];
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);

        /* Bit j of the mask enables grid point j (same order as the offsets) */
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
        {
            if(mask & 1)
            {
                *num_cands = *num_cands + 1;
                cand0->grid_ix = i;
                cand0->ref_idx = ps_grid->p_ref_idx[i];
                cand0->pu1_ref_ptr =
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
                cand0++;
            }
        }
    }

    /* Loop to compute the SAD's */
    for(i = 0; i < *num_cands; i++)
    {
        cand_t *cand = ps_cand + i;

        /* Four bands of four sub-blocks fill the 4x4 raster of sub-block SADs */
        for(j = 0; j < 4; j++)
        {
            calc_sad4(
                pu1_cur_ptr + j * cur_buf_stride_lsN,
                cur_buf_stride,
                cand->pu1_ref_ptr + j * ref_buf_stride_lsN,
                ref_buf_stride,
                &au4_nxn_sad[4 * j]);
        }

        COMBINE_SADS(pp_part_sads, au4_nxn_sad, i);
    }
}
340*c83a76b0SSuyog Pawar 
/*
 * 16x16 counterpart of compute_part_sads_for_MxM_blk_neon(): builds the
 * candidate list described by the search grid, computes sixteen 4x4 SADs per
 * candidate and combines them into per-partition SADs.
 *
 *   ps_grid        : grid description (centre pointers, centre MVs, per-grid
 *                    candidate masks, packed grid step, reference stride)
 *   pu1_cur_ptr    : top-left of the current 16x16 block
 *   cur_buf_stride : stride of the current-block buffer
 *   pp_part_sads   : output table, written as pp_part_sads[PART_ID_xxx][cand]
 *   ps_cand        : output candidate array; up to NUM_CANDIDATES_IN_GRID
 *                    entries are appended per grid
 *   num_cands      : receives the number of candidates written
 */
void compute_4x4_sads_for_16x16_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    UWORD16 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands)
{
    /* Grid step is packed as (y << 16) | x */
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Grid point order: C, L, T, R, B, TL, TR, BL, BR */
    WORD32 offset_x[NUM_CANDIDATES_IN_GRID] = {
        0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x
    };
    WORD32 offset_y[NUM_CANDIDATES_IN_GRID] = {
        0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y
    };

    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;

    /* Distance between two consecutive bands of 4x4 blocks (four rows) */
    WORD32 cur_band_step = (cur_buf_stride << 2);
    WORD32 ref_band_step = (ref_buf_stride << 2);

    cand_t *ps_next_cand = ps_cand;

    /* 4x4 raster of sub-block SADs for one candidate (needed for AMP shapes) */
    UWORD16 au2_4x4_sad[16];

    WORD32 grid_idx, pt_idx, cand_idx, band;

    *num_cands = 0;

    /* Pass 1: expand every enabled grid point into a candidate entry */
    for(grid_idx = 0; grid_idx < ps_grid->num_grids; grid_idx++)
    {
        WORD32 mask = ps_grid->pi4_grd_mask[grid_idx];
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[grid_idx];
        WORD32 mv_x = ps_grid->p_mv[grid_idx].i2_mv_x;
        WORD32 mv_y = (ps_grid->p_mv[grid_idx].i2_mv_y);

        for(pt_idx = 0; pt_idx < NUM_CANDIDATES_IN_GRID; pt_idx++)
        {
            /* Bit pt_idx of the mask enables grid point pt_idx */
            if(!((mask >> pt_idx) & 1))
                continue;

            ps_next_cand->grid_ix = grid_idx;
            ps_next_cand->ref_idx = ps_grid->p_ref_idx[grid_idx];
            ps_next_cand->pu1_ref_ptr =
                pu1_ref_ptr_center + offset_x[pt_idx] + ref_buf_stride * offset_y[pt_idx];
            ps_next_cand->mv.i2_mv_x = (S16)(mv_x) + offset_x[pt_idx];
            ps_next_cand->mv.i2_mv_y = (S16)(mv_y) + offset_y[pt_idx];
            ps_next_cand++;
            *num_cands = *num_cands + 1;
        }
    }

    /* Pass 2: sub-block SADs and partition SADs for each candidate */
    for(cand_idx = 0; cand_idx < *num_cands; cand_idx++)
    {
        UWORD8 *pu1_cand_ref = ps_cand[cand_idx].pu1_ref_ptr;

        for(band = 0; band < 4; band++)
        {
            ihevce_sad4_4x4_neon(
                pu1_cur_ptr + band * cur_band_step,
                cur_buf_stride,
                pu1_cand_ref + band * ref_band_step,
                ref_buf_stride,
                &au2_4x4_sad[4 * band]);
        }

        COMBINE_SADS(pp_part_sads, au2_4x4_sad, cand_idx);
    }
}
413*c83a76b0SSuyog Pawar 
/*
 * Evaluates the SAD of a single partition at every enabled point of a
 * 9-point grid around ps_prms->pu1_ref.
 *
 * Grid point i is enabled by bit i of i4_grid_mask and lies at
 * (i4_step * gai1_grid_id_to_x[i], i4_step * gai1_grid_id_to_y[i]) from the
 * centre. Results are stored consecutively in pi4_sad_grid, starting at
 * offset (pi4_valid_part_ids[0] * number of enabled points).
 *
 * i4_part_mask must have at most one bit set (asserted).
 */
void hme_evalsad_grid_npu_MxN_neon(err_prms_t *ps_prms)
{
    const S32 step_x = ps_prms->i4_step;
    const S32 step_y = ps_prms->i4_step * ps_prms->i4_ref_stride;
    S32 *pi4_out;
    S32 num_enabled = 0;
    S32 pt;

    assert((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    /* Count enabled grid points; the count sizes each partition's slot */
    for(pt = 0; pt < 9; pt++)
    {
        num_enabled += (ps_prms->i4_grid_mask & (1 << pt)) ? 1 : 0;
    }

    pi4_out = ps_prms->pi4_sad_grid + (ps_prms->pi4_valid_part_ids[0] * num_enabled);

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            /* Displace the reference pointer to this grid point */
            U08 *pu1_ref_pt = ps_prms->pu1_ref + step_x * gai1_grid_id_to_x[pt];

            pu1_ref_pt += step_y * gai1_grid_id_to_y[pt];

            *pi4_out++ = ihevce_4mx4n_sad_computer_neon(
                ps_prms->pu1_inp,
                pu1_ref_pt,
                ps_prms->i4_inp_stride,
                ps_prms->i4_ref_stride,
                ps_prms->i4_blk_wd,
                ps_prms->i4_blk_ht);
        }
    }
}
449*c83a76b0SSuyog Pawar 
/*
 * Evaluates the SAD of one i4_blk_wd x i4_blk_ht block at a single point:
 * compares ps_prms->pu1_inp against ps_prms->pu1_ref and stores the result
 * in ps_prms->pi4_sad_grid[0].
 */
void hme_evalsad_pt_npu_MxN_8bit_neon(err_prms_t *ps_prms)
{
    S32 *pi4_result = &ps_prms->pi4_sad_grid[0];

    *pi4_result = ihevce_4mx4n_sad_computer_neon(
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);
}
460*c83a76b0SSuyog Pawar 
/**
 * Evaluates every search candidate and keeps one best result per partition.
 *
 * For each node in ps_search_prms->ps_search_nodes the function:
 *   1. points ps_err_prms->pu1_inp / pu1_ref at the weighted input and the
 *      MV-displaced reference of the node's reference index,
 *   2. calls ihevce_sad4_4x4_neon() four times (one call per band of four
 *      rows) to fill au2_4x4_sad, then expands it into pi4_sad_grid through
 *      COMBINE_SADS_2,
 *   3. derives an MV cost from the difference to the 2Nx2N MVP node, scaled
 *      by lambda and clipped with CLIP_U16,
 *   4. for every valid partition, overwrites slot [0] of the full-pel refine
 *      context when the clipped SAD + MV cost beats the stored total cost.
 *
 * After the candidate loop, partitions whose stored cost is still
 * MAX_SIGNED_16BIT_VAL (slots [0] and [1]) get the reference index of the
 * first search node.
 *
 * @param ps_search_prms  candidate list, offsets and full-pel refine context
 * @param ps_wt_inp_prms  per-reference weighted input planes (apu1_wt_inp)
 * @param ps_err_prms     strides and SAD grid; pu1_inp / pu1_ref are written
 * @param ps_result_prms  supplies the search results / prediction context
 * @param ppu1_ref        per-reference reference plane pointers
 * @param i4_ref_stride   stride used to position the reference pointer
 *
 * NOTE(review): the reference pointer is positioned with i4_ref_stride while
 * the SAD kernel is given ps_err_prms->i4_ref_stride; presumably both hold
 * the same value - confirm against the caller.
 */
void hme_calc_sad_and_1_best_result_neon(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    mv_refine_ctxt_t *refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 i4_cand_count = ps_search_prms->i4_num_search_nodes;
    S32 i4_cur_strd = ps_err_prms->i4_inp_stride;
    S32 i4_ref_strd = ps_err_prms->i4_ref_stride;
    /* Distance, in samples, between two consecutive bands of four rows */
    S32 i4_cur_band_step = (i4_cur_strd << 2);
    S32 i4_ref_band_step = (i4_ref_strd << 2);
    S32 i4_inp_off = ps_search_prms->i4_cu_x_off + (ps_search_prms->i4_cu_y_off * i4_cur_strd);
    S32 i4_ref_off = ps_search_prms->i4_x_off + (ps_search_prms->i4_y_off * i4_ref_stride);
    S32 i4_cand;
    S32 i4_part;

    for(i4_cand = 0; i4_cand < i4_cand_count; i4_cand++)
    {
        U16 au2_4x4_sad[16];
        UWORD8 *pu1_cur_blk;
        UWORD8 *pu1_ref_blk;
        S32 i4_mv_cost;
        S32 i4_band;

        /* NOTE(review): this `continue` also skips the ps_search_node++ at */
        /* the bottom of the loop, so after one INTRA_MV node every later   */
        /* iteration re-tests the same node. Kept as-is; confirm intent.    */
        if(INTRA_MV == ps_search_node->s_mv.i2_mvx)
        {
            continue;
        }

        /* Weighted input block and MV-displaced reference block */
        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_off +
                               ps_search_node->s_mv.i2_mvx +
                               (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

        pu1_cur_blk = ps_err_prms->pu1_inp;
        pu1_ref_blk = ps_err_prms->pu1_ref;

        /* One kernel call per band; each call receives its own group of */
        /* four entries in au2_4x4_sad                                   */
        for(i4_band = 0; i4_band < 4; i4_band++)
        {
            ihevce_sad4_4x4_neon(
                pu1_cur_blk + i4_band * i4_cur_band_step,
                i4_cur_strd,
                pu1_ref_blk + i4_band * i4_ref_band_step,
                i4_ref_strd,
                &au2_4x4_sad[4 * i4_band]);
        }

        COMBINE_SADS_2(pi4_sad_grid, au2_4x4_sad);

        /* MV cost relative to the 2Nx2N MVP node of the result's pred ctxt */
        {
            S16 mvdx1, mvdy1;
            search_results_t *ps_results = ps_result_prms->ps_search_results;
            pred_ctxt_t *ps_pred_ctxt = &ps_results->as_pred_ctxt[ps_result_prms->i1_ref_idx];
            search_node_t *ps_mvp_node = ps_pred_ctxt->as_pred_nodes[PART_2Nx2N].ps_mvp_node;

            S32 inp_shift = 2;
            S32 pred_shift = ps_mvp_node->u1_subpel_done ? 0 : 2;
            S32 mv_p_x = ps_mvp_node->s_mv.i2_mvx;
            S32 mv_p_y = ps_mvp_node->s_mv.i2_mvy;
            S32 i4_lambda = ps_pred_ctxt->lambda;
            S32 i4_q_shift = ps_pred_ctxt->lambda_q_shift;
            S32 i4_round = 1 << (i4_q_shift - 1);
            S32 i4_ref_bits =
                ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
            S32 i4_mv_bits;

            COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

            mvdx1 = ABS(mvdx1);
            mvdy1 = ABS(mvdy1);

            i4_mv_bits = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) +
                         i4_ref_bits + 2;

            /* Scale by lambda with rounding, then bring into 16-bit range */
            i4_mv_cost = ((i4_mv_bits * i4_lambda) + i4_round) >> i4_q_shift;
            i4_mv_cost = CLIP_U16(i4_mv_cost);
        }

        /* Keep this candidate for every valid partition it improves */
        for(i4_part = 0; i4_part < refine_ctxt->i4_num_valid_parts; i4_part++)
        {
            S32 part_id = refine_ctxt->ai4_part_id[i4_part];
            /* Results are stored by loop position unless more than eight */
            /* partitions are valid, in which case part_id is the index   */
            S32 id = i4_part;
            S32 i4_sad;
            S32 i4_tot_cost;
            S32 best_node_cost;

            if(refine_ctxt->i4_num_valid_parts > 8)
            {
                id = part_id;
            }

            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
            best_node_cost = CLIP_S16(refine_ctxt->i2_tot_cost[0][id]);

            if(i4_tot_cost >= best_node_cost)
            {
                continue;
            }

            refine_ctxt->i2_tot_cost[0][id] = i4_tot_cost;
            refine_ctxt->i2_mv_cost[0][id] = i4_mv_cost;
            refine_ctxt->i2_mv_x[0][id] = ps_search_node->s_mv.i2_mvx;
            refine_ctxt->i2_mv_y[0][id] = ps_search_node->s_mv.i2_mvy;
            refine_ctxt->i2_ref_idx[0][id] = ps_search_node->i1_ref_idx;
        }

        ps_search_node++;
    }

    /* Rewind to the first candidate; its reference index is used below */
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* NOTE(review): this pass indexes the result arrays by part_id even   */
    /* when the loop above used the compact position index - confirm that  */
    /* this is the intended slot.                                          */
    for(i4_part = 0; i4_part < refine_ctxt->i4_num_valid_parts; i4_part++)
    {
        S32 part_id = refine_ctxt->ai4_part_id[i4_part];

        if(refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
        {
            assert(refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
            assert(refine_ctxt->i2_mv_x[0][part_id] == 0);
            assert(refine_ctxt->i2_mv_y[0][part_id] == 0);

            refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
        }

        if(refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
        {
            assert(refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
            assert(refine_ctxt->i2_mv_x[1][part_id] == 0);
            assert(refine_ctxt->i2_mv_y[1][part_id] == 0);

            refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
        }
    }
}
608*c83a76b0SSuyog Pawar 
/**
 * Sub-pel counterpart of the single-best-result update.
 *
 * Computes the SAD grid for the block pair currently held in ps_err_prms
 * (four ihevce_sad4_4x4_neon() calls, one per band of four rows, expanded by
 * COMBINE_SADS_2) and, for every valid partition, replaces slot [0] of the
 * sub-pel refine context when clipped SAD + stored MV cost is lower than the
 * stored total cost. The MV and reference index written come from
 * ps_result_prms.
 *
 * Finally, every partition slot whose total cost is still at or above
 * MAX_SIGNED_16BIT_VAL has ai2_fullpel_satd[0][] set to MAX_SIGNED_16BIT_VAL.
 *
 * @param ps_err_prms     input/reference pointers, strides and SAD grid
 * @param ps_result_prms  sub-pel refine context plus the MV / ref index
 *                        recorded on improvement
 */
void hme_calc_sad_and_1_best_result_subpel_neon(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_refine = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    UWORD8 *pu1_cur_blk = ps_err_prms->pu1_inp;
    UWORD8 *pu1_ref_blk = ps_err_prms->pu1_ref;
    S32 i4_cur_strd = ps_err_prms->i4_inp_stride;
    S32 i4_ref_strd = ps_err_prms->i4_ref_stride;
    /* Distance, in samples, between two consecutive bands of four rows */
    S32 i4_cur_band_step = (i4_cur_strd << 2);
    S32 i4_ref_band_step = (i4_ref_strd << 2);
    U16 au2_4x4_sad[16];
    S32 i4_band;
    S32 i4_part;
    S32 i4_slot;

    /* One kernel call per band; each call receives its own group of four */
    /* entries in au2_4x4_sad                                             */
    for(i4_band = 0; i4_band < 4; i4_band++)
    {
        ihevce_sad4_4x4_neon(
            pu1_cur_blk + i4_band * i4_cur_band_step,
            i4_cur_strd,
            pu1_ref_blk + i4_band * i4_ref_band_step,
            i4_ref_strd,
            &au2_4x4_sad[4 * i4_band]);
    }

    COMBINE_SADS_2(pi4_sad_grid, au2_4x4_sad);

    /* Update the stored best result of each valid partition */
    for(i4_part = 0; i4_part < ps_refine->i4_num_valid_parts; i4_part++)
    {
        S32 part_id = ps_refine->ai4_part_id[i4_part];
        /* Results are stored by loop position unless more than eight */
        /* partitions are valid, in which case part_id is the index   */
        S32 id = i4_part;
        S32 i4_mv_cost;
        S32 i4_sad;
        S32 i4_tot_cost;
        S32 best_node_cost;

        if(ps_refine->i4_num_valid_parts > 8)
        {
            id = part_id;
        }

        /* The MV cost is taken from the context, not recomputed */
        i4_mv_cost = ps_refine->i2_mv_cost[0][id];
        i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
        i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
        best_node_cost = CLIP_S16(ps_refine->i2_tot_cost[0][id]);

        if(i4_tot_cost >= best_node_cost)
        {
            continue;
        }

        ps_refine->i2_tot_cost[0][id] = i4_tot_cost;
        /* Writes back the value read above */
        ps_refine->i2_mv_cost[0][id] = i4_mv_cost;
        ps_refine->i2_mv_x[0][id] = ps_result_prms->i2_mv_x;
        ps_refine->i2_mv_y[0][id] = ps_result_prms->i2_mv_y;
        ps_refine->i2_ref_idx[0][id] = ps_result_prms->i1_ref_idx;
    }

    /* Slots that never received a usable cost get a saturated SATD too */
    for(i4_slot = 0; i4_slot < TOT_NUM_PARTS; i4_slot++)
    {
        if(ps_refine->i2_tot_cost[0][i4_slot] >= MAX_SIGNED_16BIT_VAL)
        {
            ps_refine->ai2_fullpel_satd[0][i4_slot] = MAX_SIGNED_16BIT_VAL;
        }
    }
}
667