xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_ssd_calculator_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_ssd_calculator_neon.c
24 *
25 * @brief
26 *  Contains intrinsic definitions of functions for SSD (sum of squared differences) computation
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_cmn_utils_instr_set_router.h"
52 
53 /*****************************************************************************/
54 /* Function Definitions                                                      */
55 /*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Partial sums of squared differences for one block of 16 samples.
*
* @par Description:
*  Sixteen source and sixteen prediction bytes are fetched through the
*  load_unaligned_u8q helper (or load_unaligned_u8qi when a chroma plane is
*  selected, in which case the pointers are first offset by chroma_plane).
*  NOTE(review): the helpers presumably gather 4 rows of 4 samples using the
*  given stride, the 'i' variant taking every other byte - confirm in
*  ihevc_cmn_utils_neon.h.
*
* @param[in] pu1_src       source samples
* @param[in] pu1_pred      prediction samples
* @param[in] src_strd      stride forwarded to the source load helper
* @param[in] pred_strd     stride forwarded to the prediction load helper
* @param[in] chroma_plane  NULL_PLANE, or the plane offset for interleaved data
*
* @returns
*  Four 32-bit partial sums; lane i holds the squared differences of loaded
*  samples i, i + 4, i + 8 and i + 12. The caller reduces the lanes.
*
*******************************************************************************
*/
static INLINE uint32x4_t ihevce_4x4_ssd_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint8x16_t src_pix, pred_pix, diff;
    uint8x8_t diff_lo, diff_hi;
    uint16x8_t sq_lo, sq_hi;

    if(chroma_plane != NULL_PLANE)
    {
        src_pix = load_unaligned_u8qi(pu1_src + chroma_plane, src_strd);
        pred_pix = load_unaligned_u8qi(pu1_pred + chroma_plane, pred_strd);
    }
    else
    {
        src_pix = load_unaligned_u8q(pu1_src, src_strd);
        pred_pix = load_unaligned_u8q(pu1_pred, pred_strd);
    }

    /* |src - pred| fits in 8 bits and its square (max 255 * 255) fits in 16 */
    diff = vabdq_u8(src_pix, pred_pix);
    diff_lo = vget_low_u8(diff);
    diff_hi = vget_high_u8(diff);
    sq_lo = vmull_u8(diff_lo, diff_lo);
    sq_hi = vmull_u8(diff_hi, diff_hi);

    /* Widen to 32 bits while folding 16 squares down to 4 lanes */
    return vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
}
82 
/**
*******************************************************************************
*
* @brief
*  Partial sums of squared differences for 8 samples of one row.
*
* @par Description:
*  With NULL_PLANE, 8 consecutive bytes are read from each pointer. Otherwise
*  16 bytes are read and de-interleaved with vld2_u8, and chroma_plane is used
*  directly as the index of the de-interleaved vector (so it must be 0 or 1).
*
* @param[in] pu1_src       source samples
* @param[in] pu1_pred      prediction samples
* @param[in] chroma_plane  NULL_PLANE, or index of the interleaved plane
*
* @returns
*  Four 32-bit partial sums; lane i holds the squared differences of
*  samples i and i + 4.
*
*******************************************************************************
*/
static INLINE uint32x4_t
    ihevce_1x8_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint8x8_t src_pix, pred_pix, diff;
    uint16x8_t sq;

    if(chroma_plane != NULL_PLANE)
    {
        const uint8x8x2_t src_pair = vld2_u8(pu1_src);
        const uint8x8x2_t pred_pair = vld2_u8(pu1_pred);

        src_pix = src_pair.val[chroma_plane];
        pred_pix = pred_pair.val[chroma_plane];
    }
    else
    {
        src_pix = vld1_u8(pu1_src);
        pred_pix = vld1_u8(pu1_pred);
    }

    /* Squared absolute difference, widened to 16 bits (255 * 255 fits) */
    diff = vabd_u8(src_pix, pred_pix);
    sq = vmull_u8(diff, diff);

    /* Fold 8 squares into 4 lanes of 32 bits */
    return vaddl_u16(vget_low_u16(sq), vget_high_u16(sq));
}
107 
/**
*******************************************************************************
*
* @brief
*  Partial sums of squared differences for 16 samples of one row.
*
* @par Description:
*  With NULL_PLANE, 16 consecutive bytes are read from each pointer. Otherwise
*  32 bytes are read and de-interleaved with vld2q_u8, and chroma_plane is
*  used directly as the index of the de-interleaved vector (so it must be
*  0 or 1).
*
* @param[in] pu1_src       source samples
* @param[in] pu1_pred      prediction samples
* @param[in] chroma_plane  NULL_PLANE, or index of the interleaved plane
*
* @returns
*  Four 32-bit partial sums; lane i holds the squared differences of
*  samples i, i + 4, i + 8 and i + 12.
*
*******************************************************************************
*/
static INLINE uint32x4_t
    ihevce_1x16_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint8x16_t src_pix, pred_pix, diff;
    uint8x8_t diff_lo, diff_hi;
    uint16x8_t sq_lo, sq_hi;

    if(chroma_plane != NULL_PLANE)
    {
        const uint8x16x2_t src_pair = vld2q_u8(pu1_src);
        const uint8x16x2_t pred_pair = vld2q_u8(pu1_pred);

        src_pix = src_pair.val[chroma_plane];
        pred_pix = pred_pair.val[chroma_plane];
    }
    else
    {
        src_pix = vld1q_u8(pu1_src);
        pred_pix = vld1q_u8(pu1_pred);
    }

    /* Squared absolute differences, widened to 16 bits (255 * 255 fits) */
    diff = vabdq_u8(src_pix, pred_pix);
    diff_lo = vget_low_u8(diff);
    diff_hi = vget_high_u8(diff);
    sq_lo = vmull_u8(diff_lo, diff_lo);
    sq_hi = vmull_u8(diff_hi, diff_hi);

    /* Fold 16 squares into 4 lanes of 32 bits */
    return vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
}
134 
/**
*******************************************************************************
*
* @brief
*  Partial sums of squared differences for 32 samples of one row.
*
* @par Description:
*  The row is handled as two chunks of 16 samples. With NULL_PLANE the chunks
*  are 16 bytes apart; otherwise each chunk is de-interleaved from 32 bytes
*  with vld2q_u8 (chunks 32 bytes apart) and chroma_plane is used directly as
*  the index of the de-interleaved vector (so it must be 0 or 1).
*
* @param[in] pu1_src       source samples
* @param[in] pu1_pred      prediction samples
* @param[in] chroma_plane  NULL_PLANE, or index of the interleaved plane
*
* @returns
*  Four 32-bit partial sums; lane i accumulates the squared differences of
*  every sample whose index is congruent to i modulo 4.
*
*******************************************************************************
*/
static INLINE uint32x4_t
    ihevce_1x32_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint8x16_t src_a, src_b, pred_a, pred_b;
    uint8x16_t diff;
    uint16x8_t sq_lo, sq_hi;
    uint32x4_t ssd_a, ssd_b;

    if(chroma_plane != NULL_PLANE)
    {
        src_a = vld2q_u8(pu1_src).val[chroma_plane];
        pred_a = vld2q_u8(pu1_pred).val[chroma_plane];
        src_b = vld2q_u8(pu1_src + 32).val[chroma_plane];
        pred_b = vld2q_u8(pu1_pred + 32).val[chroma_plane];
    }
    else
    {
        src_a = vld1q_u8(pu1_src);
        pred_a = vld1q_u8(pu1_pred);
        src_b = vld1q_u8(pu1_src + 16);
        pred_b = vld1q_u8(pu1_pred + 16);
    }

    /* First 16 samples */
    diff = vabdq_u8(src_a, pred_a);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_a = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    /* Second 16 samples */
    diff = vabdq_u8(src_b, pred_b);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_b = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    return vaddq_u32(ssd_a, ssd_b);
}
172 
/**
*******************************************************************************
*
* @brief
*  Partial sums of squared differences for 64 samples of one row.
*
* @par Description:
*  The row is handled as four chunks of 16 samples. With NULL_PLANE the chunks
*  are 16 bytes apart; otherwise each chunk is de-interleaved from 32 bytes
*  with vld2q_u8 (chunks 32 bytes apart) and chroma_plane is used directly as
*  the index of the de-interleaved vector (so it must be 0 or 1).
*
* @param[in] pu1_src       source samples
* @param[in] pu1_pred      prediction samples
* @param[in] chroma_plane  NULL_PLANE, or index of the interleaved plane
*
* @returns
*  Four 32-bit partial sums; lane i accumulates the squared differences of
*  every sample whose index is congruent to i modulo 4.
*
*******************************************************************************
*/
static INLINE uint32x4_t
    ihevce_1x64_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint8x16_t src_a, src_b, src_c, src_d;
    uint8x16_t pred_a, pred_b, pred_c, pred_d;
    uint8x16_t diff;
    uint16x8_t sq_lo, sq_hi;
    uint32x4_t ssd_a, ssd_b, ssd_c, ssd_d;

    if(chroma_plane != NULL_PLANE)
    {
        src_a = vld2q_u8(pu1_src).val[chroma_plane];
        pred_a = vld2q_u8(pu1_pred).val[chroma_plane];
        src_b = vld2q_u8(pu1_src + 32).val[chroma_plane];
        pred_b = vld2q_u8(pu1_pred + 32).val[chroma_plane];
        src_c = vld2q_u8(pu1_src + 64).val[chroma_plane];
        pred_c = vld2q_u8(pu1_pred + 64).val[chroma_plane];
        src_d = vld2q_u8(pu1_src + 96).val[chroma_plane];
        pred_d = vld2q_u8(pu1_pred + 96).val[chroma_plane];
    }
    else
    {
        src_a = vld1q_u8(pu1_src);
        pred_a = vld1q_u8(pu1_pred);
        src_b = vld1q_u8(pu1_src + 16);
        pred_b = vld1q_u8(pu1_pred + 16);
        src_c = vld1q_u8(pu1_src + 32);
        pred_c = vld1q_u8(pu1_pred + 32);
        src_d = vld1q_u8(pu1_src + 48);
        pred_d = vld1q_u8(pu1_pred + 48);
    }

    /* Samples 0..15 */
    diff = vabdq_u8(src_a, pred_a);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_a = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    /* Samples 16..31 */
    diff = vabdq_u8(src_b, pred_b);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_b = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    /* Samples 32..47 */
    diff = vabdq_u8(src_c, pred_c);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_c = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    /* Samples 48..63 */
    diff = vabdq_u8(src_d, pred_d);
    sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
    sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
    ssd_d = vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));

    return vaddq_u32(vaddq_u32(ssd_a, ssd_b), vaddq_u32(ssd_c, ssd_d));
}
236 
/**
*******************************************************************************
*
* @brief
*  Sum of squared differences between an input block and a reference block.
*
* @par Description:
*  Rows are dispatched to the fixed-width row helpers for widths 8, 16, 32
*  and 64; any other multiple of 8 is processed 8 samples at a time. A width
*  of 4 is handled by the 4x4 helper and requires ht == 4 (asserted).
*  Any other width (less than 8 and not 4, or not a multiple of 8) is not
*  processed and contributes nothing to the result.
*
*  Per-lane accumulation is done in 32 bits across all rows and is only
*  widened to 64 bits at the end, so very large wd * ht products could wrap.
*  NOTE(review): callers presumably pass block-sized regions - confirm.
*
* @param[in] pu1_inp       input samples
* @param[in] pu1_ref       reference samples
* @param[in] inp_stride    byte offset between consecutive input rows
* @param[in] ref_stride    byte offset between consecutive reference rows
* @param[in] wd            block width in samples of the selected plane
* @param[in] ht            block height in rows
* @param[in] chroma_plane  NULL_PLANE for contiguous samples, otherwise the
*                          index (0 or 1) of the plane inside interleaved data
*
* @returns
*  The accumulated sum of squared differences.
*
*******************************************************************************
*/
static LWORD64 ihevce_ssd_calculator_plane_neon(
    UWORD8 *pu1_inp,
    UWORD8 *pu1_ref,
    UWORD32 inp_stride,
    UWORD32 ref_stride,
    UWORD32 wd,
    UWORD32 ht,
    CHROMA_PLANE_ID_T chroma_plane)
{
    uint32x4_t ssd = vdupq_n_u32(0);
    uint32x2_t sum;

    if(wd >= 8)
    {
        /* Bytes spanned by 8 samples of the selected plane. Interleaved data */
        /* stores two bytes per sample (the 1x8 helper reads 16 bytes through */
        /* vld2_u8), matching the 32-byte step per 16 samples used by the     */
        /* 1x32 and 1x64 helpers.                                             */
        const UWORD32 bytes_per_8 = (chroma_plane == NULL_PLANE) ? 8 : 16;
        UWORD32 row;

        for(row = ht; row > 0; row--)
        {
            if(wd == 8)
            {
                ssd = vaddq_u32(ssd, ihevce_1x8_ssd_computer_neon(pu1_inp, pu1_ref, chroma_plane));
            }
            else if(wd == 16)
            {
                ssd = vaddq_u32(ssd, ihevce_1x16_ssd_computer_neon(pu1_inp, pu1_ref, chroma_plane));
            }
            else if(wd == 32)
            {
                ssd = vaddq_u32(ssd, ihevce_1x32_ssd_computer_neon(pu1_inp, pu1_ref, chroma_plane));
            }
            else if(wd == 64)
            {
                ssd = vaddq_u32(ssd, ihevce_1x64_ssd_computer_neon(pu1_inp, pu1_ref, chroma_plane));
            }
            else if(wd % 8 == 0)
            {
                UWORD32 col;
                UWORD8 *inp = pu1_inp, *ref = pu1_ref;

                for(col = 0; col < wd; col += 8)
                {
                    ssd = vaddq_u32(ssd, ihevce_1x8_ssd_computer_neon(inp, ref, chroma_plane));
                    ref = ref + bytes_per_8;
                    inp = inp + bytes_per_8;
                }
            }

            pu1_inp += inp_stride;
            pu1_ref += ref_stride;
        }
    }
    else if(wd == 4)
    {
        assert(ht == 4);
        ssd = ihevce_4x4_ssd_computer_neon(pu1_inp, pu1_ref, inp_stride, ref_stride, chroma_plane);
    }

    /* Horizontal reduction: 4 lanes -> 2 lanes -> one 64-bit total */
    sum = vadd_u32(vget_low_u32(ssd), vget_high_u32(ssd));
    return vget_lane_u64(vpaddl_u32(sum), 0);
}
289 
ihevce_ssd_calculator_neon(UWORD8 * pu1_inp,UWORD8 * pu1_ref,UWORD32 inp_stride,UWORD32 ref_stride,UWORD32 wd,UWORD32 ht,CHROMA_PLANE_ID_T chroma_plane)290 LWORD64 ihevce_ssd_calculator_neon(
291     UWORD8 *pu1_inp, UWORD8 *pu1_ref, UWORD32 inp_stride, UWORD32 ref_stride, UWORD32 wd,
292     UWORD32 ht, CHROMA_PLANE_ID_T chroma_plane)
293 {
294     return ihevce_ssd_calculator_plane_neon(pu1_inp, pu1_ref, inp_stride, ref_stride, wd, ht,
295                                             chroma_plane);
296 }
297 
ihevce_chroma_interleave_ssd_calculator_neon(UWORD8 * pu1_inp,UWORD8 * pu1_ref,UWORD32 inp_stride,UWORD32 ref_stride,UWORD32 wd,UWORD32 ht,CHROMA_PLANE_ID_T chroma_plane)298 LWORD64 ihevce_chroma_interleave_ssd_calculator_neon(
299     UWORD8 *pu1_inp, UWORD8 *pu1_ref, UWORD32 inp_stride, UWORD32 ref_stride, UWORD32 wd,
300     UWORD32 ht, CHROMA_PLANE_ID_T chroma_plane)
301 {
302     return ihevce_ssd_calculator_plane_neon(pu1_inp, pu1_ref, inp_stride, ref_stride, wd, ht,
303                                             chroma_plane);
304 }
305