/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_pred_residual_recon_neonintr.c
 *
 * @brief
 *  Contains definitions of functions for H.264 inverse quantization, inverse
 *  transformation and residual computation
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_pred_residual_recon_16x16_neonintr()
 *  - isvcd_pred_residual_recon_8x8_neonintr()
 *  - isvcd_pred_residual_recon_4x4_neonintr()
 *  - isvcd_pred_residual_recon_chroma_4x4_neonintr()
 *  - isvcd_pred_residual_recon_chroma_8x8_neonintr()
 *  - isvcd_residual_luma_4x4_neonintr()
 *  - isvcd_residual_luma_8x8_neonintr()
 *  - isvcd_residual_luma_16x16_neonintr()
 *  - isvcd_residual_chroma_cb_cr_8x8_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/*                              File Includes                                */
/*****************************************************************************/
#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_pred_residual_recon.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_4x4_neonintr                   */
/*                                                                           */
/*  Description   : this function computes the recon data from the           */
/*                  pred and residual buffers                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

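/* A scalar sketch of what this kernel computes (a hypothetical reference,
   not part of the build): each output pixel is the prediction plus the
   residual, clamped to [0, 255], and nnz flags whether any residual
   coefficient in the 4x4 block is non-zero.

   WORD32 i, j, i4_nnz = 0;
   for(i = 0; i < 4; i++)
   {
       for(j = 0; j < 4; j++)
       {
           WORD32 val = pu1_pred[i * pred_strd + j] + pi2_rsd[i * rsd_strd + j];
           i4_nnz |= (pi2_rsd[i * rsd_strd + j] != 0);
           pu1_out[i * out_strd + j] = (UWORD8) ((val < 0) ? 0 : ((val > 255) ? 255 : val));
       }
   }
   return i4_nnz;
*/
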
WORD32 isvcd_pred_residual_recon_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in;
    WORD32 i4_nnz;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 1));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd01_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd0_in)),
                                                   vget_low_s64(vreinterpretq_s64_s16(resd1_in))));

    resd23_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd2_in)),
                                                   vget_low_s64(vreinterpretq_s64_s16(resd3_in))));

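    /* nnz is non-zero if any coefficient in the 4x4 residual block is
       non-zero; the saturating add keeps |a| + |b| from wrapping, so a
       non-zero pair can never cancel back to zero. */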
    dup_val_1 = vabsq_s16(resd01_in);
    dup_val_2 = vabsq_s16(resd23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

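    /* Only 4 output pixels per row are valid, so store just the low 32-bit
       lane of each reconstructed vector. */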
    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(pred1_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 2), vreinterpret_u32_u8(pred2_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 3), vreinterpret_u32_u8(pred3_in), 0);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_8x8_neonintr                   */
/*                                                                           */
/*  Description   : this function computes the recon data from the           */
/*                  pred and residual buffers                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_pred_residual_recon_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t pred4, pred5, pred6, pred7;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    int64x2_t resd0_in_64x2, resd1_in_64x2, resd2_in_64x2, resd3_in_64x2, resd4_in_64x2,
        resd5_in_64x2, resd6_in_64x2, resd7_in_64x2;

    int16x8_t resd_b0_r01_in;
    int16x8_t resd_b0_r23_in;
    int16x8_t resd_b1_r01_in;
    int16x8_t resd_b1_r23_in;
    int16x8_t resd_b2_r45_in;
    int16x8_t resd_b2_r67_in;
    int16x8_t resd_b3_r45_in;
    int16x8_t resd_b3_r67_in;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));
    pred4_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 4));
    pred5_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 5));
    pred6_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 6));
    pred7_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 7));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

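    /* One bit per 4x4 sub-block; the positions (0, 1, 4, 5) match the raster
       order of 4x4 blocks within a 16x16 macroblock, consistent with the
       16x16 kernel below. */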
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out), pred0_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), pred7_in);

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_16x16_neonintr                 */
/*                                                                           */
/*  Description   : this function computes the recon data from the           */
/*                  pred and residual buffers                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

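/* The returned nnz packs one flag per 4x4 sub-block of the 16x16 macroblock,
   in raster order of the 4x4 blocks (a reading of the shift patterns below):

       bit  0  bit  1 | bit  2  bit  3
       bit  4  bit  5 | bit  6  bit  7
       ---------------+---------------
       bit  8  bit  9 | bit 10  bit 11
       bit 12  bit 13 | bit 14  bit 15

   Each 8x8 quadrant therefore contributes four bits at non-contiguous
   positions. */
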
WORD32 isvcd_pred_residual_recon_16x16_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t pred4, pred5, pred6, pred7;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    UWORD8 *pu1_pred_ptr = pu1_pred;
    WORD16 *pi2_rsd_ptr = pi2_rsd;
    UWORD8 *pu1_out_ptr = pu1_out;

    int64x2_t resd0_in_64x2, resd1_in_64x2, resd2_in_64x2, resd3_in_64x2, resd4_in_64x2,
        resd5_in_64x2, resd6_in_64x2, resd7_in_64x2;

    int16x8_t resd_b0_r01_in;
    int16x8_t resd_b0_r23_in;
    int16x8_t resd_b1_r01_in;
    int16x8_t resd_b1_r23_in;
    int16x8_t resd_b2_r45_in;
    int16x8_t resd_b2_r67_in;
    int16x8_t resd_b3_r45_in;
    int16x8_t resd_b3_r67_in;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    /* First 8 rows, first 8x8 block */
    pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
    pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
    pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
    pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
    pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
    pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

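    /* Top-left 8x8 quadrant: 4x4 sub-blocks 0, 1, 4 and 5. */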
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);

    /* First 8 rows, second 8x8 block */
    pu1_out_ptr = pu1_out_ptr + 8;
    pi2_rsd_ptr = pi2_rsd_ptr + 8;
    pu1_pred_ptr = pu1_pred_ptr + 8;

    pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
    pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
    pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
    pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
    pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
    pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

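    /* Top-right 8x8 quadrant: 4x4 sub-blocks 2, 3, 6 and 7. */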
    nnz |= (nnz_b0 << 2 | (nnz_b1 << 3) | (nnz_b2 << 6) | (nnz_b3 << 7));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);

    pu1_out_ptr = pu1_out + (8 * out_strd);
    pi2_rsd_ptr = pi2_rsd + (8 * rsd_strd);
    pu1_pred_ptr = pu1_pred + (8 * pred_strd);

    /* Second 8 rows, first 8x8 block */
    pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
    pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
    pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
    pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
    pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
    pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

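    /* Bottom-left 8x8 quadrant: 4x4 sub-blocks 8, 9, 12 and 13. */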
    nnz |= (nnz_b0 << 8 | (nnz_b1 << 9) | (nnz_b2 << 12) | (nnz_b3 << 13));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);

    /* Second 8 rows, second 8x8 block */
    pu1_out_ptr = pu1_out_ptr + 8;
    pi2_rsd_ptr = pi2_rsd_ptr + 8;
    pu1_pred_ptr = pu1_pred_ptr + 8;

    pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
    pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
    pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
    pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
    pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
    pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

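    /* Bottom-right 8x8 quadrant: 4x4 sub-blocks 10, 11, 14 and 15. */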
    nnz |= (nnz_b0 << 10 | (nnz_b1 << 11) | (nnz_b2 << 14) | (nnz_b3 << 15));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_8x8_neonintr            */
/*                                                                           */
/*  Description   : this function computes the recon data from the           */
/*                  pred and residual buffers                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

void isvcd_pred_residual_recon_chroma_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t pred4, pred5, pred6, pred7;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;

    UWORD8 *pu1_pred_ptr = pu1_pred;
    WORD16 *pi2_rsd_ptr = pi2_rsd;
    UWORD8 *pu1_out_ptr = pu1_out;
    uint8x16_t pred0_inp_full, pred1_inp_full, pred2_inp_full, pred3_inp_full;
    uint8x16_t pred4_inp_full, pred5_inp_full, pred6_inp_full, pred7_inp_full;
    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    uint8x8_t i4_out_horz_8x8_r4, i4_out_horz_8x8_r5, i4_out_horz_8x8_r6, i4_out_horz_8x8_r7;
    uint8x8_t chroma_mask_8x8;

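    /* Prediction and output rows are interleaved CbCr, so the 8x8 chroma
       block spans 16 bytes per row: the low half of each row is processed
       first, the high half after the first set of stores. */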
    pred0_inp_full = vld1q_u8((uint8_t *) pu1_pred);
    pred1_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 3));
    pred4_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 4));
    pred5_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 5));
    pred6_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 6));
    pred7_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 7));

    pred0_in = vget_low_u8(pred0_inp_full);
    pred1_in = vget_low_u8(pred1_inp_full);
    pred2_in = vget_low_u8(pred2_inp_full);
    pred3_in = vget_low_u8(pred3_inp_full);
    pred4_in = vget_low_u8(pred4_inp_full);
    pred5_in = vget_low_u8(pred5_inp_full);
    pred6_in = vget_low_u8(pred6_inp_full);
    pred7_in = vget_low_u8(pred7_inp_full);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

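    /* A 0x00ff mask in each 16-bit lane selects alternate bytes: vbsl takes
       the reconstructed component from pred*_in and keeps the other
       interleaved component's bytes from the existing output row. */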
    chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);
    i4_out_horz_8x8_r4 = vld1_u8(pu1_out + out_strd * 4);
    i4_out_horz_8x8_r5 = vld1_u8(pu1_out + out_strd * 5);
    i4_out_horz_8x8_r6 = vld1_u8(pu1_out + out_strd * 6);
    i4_out_horz_8x8_r7 = vld1_u8(pu1_out + out_strd * 7);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
    i4_out_horz_8x8_r4 = vbsl_u8(chroma_mask_8x8, pred4_in, i4_out_horz_8x8_r4);
    i4_out_horz_8x8_r5 = vbsl_u8(chroma_mask_8x8, pred5_in, i4_out_horz_8x8_r5);
    i4_out_horz_8x8_r6 = vbsl_u8(chroma_mask_8x8, pred6_in, i4_out_horz_8x8_r6);
    i4_out_horz_8x8_r7 = vbsl_u8(chroma_mask_8x8, pred7_in, i4_out_horz_8x8_r7);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), i4_out_horz_8x8_r4);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), i4_out_horz_8x8_r5);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), i4_out_horz_8x8_r6);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), i4_out_horz_8x8_r7);

    /* Process the next 4 interleaved chroma pixels: the high half of each row */
    pred0_in = vget_high_u8(pred0_inp_full);
    pred1_in = vget_high_u8(pred1_inp_full);
    pred2_in = vget_high_u8(pred2_inp_full);
    pred3_in = vget_high_u8(pred3_inp_full);
    pred4_in = vget_high_u8(pred4_inp_full);
    pred5_in = vget_high_u8(pred5_inp_full);
    pred6_in = vget_high_u8(pred6_inp_full);
    pred7_in = vget_high_u8(pred7_inp_full);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    pi2_rsd = pi2_rsd + 8;
    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    pu1_out = pu1_out + 8;
    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);
    i4_out_horz_8x8_r4 = vld1_u8(pu1_out + out_strd * 4);
    i4_out_horz_8x8_r5 = vld1_u8(pu1_out + out_strd * 5);
    i4_out_horz_8x8_r6 = vld1_u8(pu1_out + out_strd * 6);
    i4_out_horz_8x8_r7 = vld1_u8(pu1_out + out_strd * 7);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
    i4_out_horz_8x8_r4 = vbsl_u8(chroma_mask_8x8, pred4_in, i4_out_horz_8x8_r4);
    i4_out_horz_8x8_r5 = vbsl_u8(chroma_mask_8x8, pred5_in, i4_out_horz_8x8_r5);
    i4_out_horz_8x8_r6 = vbsl_u8(chroma_mask_8x8, pred6_in, i4_out_horz_8x8_r6);
    i4_out_horz_8x8_r7 = vbsl_u8(chroma_mask_8x8, pred7_in, i4_out_horz_8x8_r7);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), i4_out_horz_8x8_r4);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), i4_out_horz_8x8_r5);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), i4_out_horz_8x8_r6);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), i4_out_horz_8x8_r7);

    pu1_out = pu1_out_ptr;
    pu1_pred = pu1_pred_ptr;
    pi2_rsd = pi2_rsd_ptr;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_4x4_neonintr            */
/*                                                                           */
/*  Description   : this function computes the recon data from the           */
/*                  pred and residual buffers                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

void isvcd_pred_residual_recon_chroma_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;

    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    UWORD8 *pu1_pred_ptr = pu1_pred;
    WORD16 *pi2_rsd_ptr = pi2_rsd;
    UWORD8 *pu1_out_ptr = pu1_out;

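    /* 4 interleaved chroma pixels per row span 8 bytes; as in the 8x8 kernel,
       the 0x00ff mask preserves the other component's bytes on store. */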
    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);

    pu1_out = pu1_out_ptr;
    pu1_pred = pu1_pred_ptr;
    pi2_rsd = pi2_rsd_ptr;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_4x4_neonintr                         */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer  */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_residual_luma_4x4_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in;
    WORD32 i4_nnz;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd01_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd0_in)),
                                                   vget_low_s64(vreinterpretq_s64_s16(resd1_in))));

    resd23_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd2_in)),
                                                   vget_low_s64(vreinterpretq_s64_s16(resd3_in))));

    dup_val_1 = vabsq_s16(resd01_in);
    dup_val_2 = vabsq_s16(resd23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_8x8_neonintr                         */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer  */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_residual_luma_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;

    int64x2_t resd0_in_64x2, resd1_in_64x2, resd2_in_64x2, resd3_in_64x2, resd4_in_64x2,
        resd5_in_64x2, resd6_in_64x2, resd7_in_64x2;

    int16x8_t resd_b0_r01_in;
    int16x8_t resd_b0_r23_in;
    int16x8_t resd_b1_r01_in;
    int16x8_t resd_b1_r23_in;
    int16x8_t resd_b2_r45_in;
    int16x8_t resd_b2_r67_in;
    int16x8_t resd_b3_r45_in;
    int16x8_t resd_b3_r67_in;

    int16x8_t dup_val_1, dup_val_2, dup_abs;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

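    /* Pack the per-4x4 flags into macroblock raster-order bit positions
       0, 1, 4 and 5. */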
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_16x16_neonintr                       */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer  */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_residual_luma_16x16_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;

    WORD16 *pi2_rsd_ptr = pi2_rsd;
    int64x2_t resd0_in_64x2, resd1_in_64x2, resd2_in_64x2, resd3_in_64x2, resd4_in_64x2,
        resd5_in_64x2, resd6_in_64x2, resd7_in_64x2;

    int16x8_t resd_b0_r01_in;
    int16x8_t resd_b0_r23_in;
    int16x8_t resd_b1_r01_in;
    int16x8_t resd_b1_r23_in;
    int16x8_t resd_b2_r45_in;
    int16x8_t resd_b2_r67_in;
    int16x8_t resd_b3_r45_in;
    int16x8_t resd_b3_r67_in;

    int16x8_t dup_val_1, dup_val_2, dup_abs;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    /* First 8 rows, first 8x8 block */
    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

1335 nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
1336
    /* Rows 0-7, columns 8-15: 4x4 blocks 2, 3, 6 and 7 of the luma nnz map */
    pi2_rsd_ptr = pi2_rsd_ptr + 8;

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz |= ((nnz_b0 << 2) | (nnz_b1 << 3) | (nnz_b2 << 6) | (nnz_b3 << 7));

    pi2_rsd_ptr = pi2_rsd + (8 * rsd_strd);
    /* Rows 8-15, columns 0-7: 4x4 blocks 8, 9, 12 and 13 of the luma nnz map */
    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz |= ((nnz_b0 << 8) | (nnz_b1 << 9) | (nnz_b2 << 12) | (nnz_b3 << 13));

    /* Rows 8-15, columns 8-15: 4x4 blocks 10, 11, 14 and 15 of the luma nnz map */
    pi2_rsd_ptr = pi2_rsd_ptr + 8;

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
    resd_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
    resd_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
    resd_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
    resd_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
    resd_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
    resd_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
    resd_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01_in);
    dup_val_2 = vabsq_s16(resd_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_in);
    dup_val_2 = vabsq_s16(resd_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_in);
    dup_val_2 = vabsq_s16(resd_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_in);
    dup_val_2 = vabsq_s16(resd_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz |= ((nnz_b0 << 10) | (nnz_b1 << 11) | (nnz_b2 << 14) | (nnz_b3 << 15));
    return nnz;
}

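/* Illustrative scalar reference for the bit layout produced above: bit
 * (4 * blk_y + blk_x) of the returned nnz map is set when the 4x4 block at
 * (blk_x, blk_y) inside the 16x16 residual has any non-zero coefficient.
 * A minimal sketch, not part of this file's API; it reuses the hypothetical
 * is_4x4_block_nonzero() helper sketched earlier.
 */
#if 0
static WORD32 residual_luma_16x16_nnz_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 blk_x, blk_y, nnz = 0;
    for(blk_y = 0; blk_y < 4; blk_y++)
    {
        for(blk_x = 0; blk_x < 4; blk_x++)
        {
            /* top-left coefficient of the (blk_x, blk_y) 4x4 block */
            const WORD16 *pi2_blk = pi2_rsd + (blk_y * 4 * rsd_strd) + (blk_x * 4);
            if(is_4x4_block_nonzero(pi2_blk, rsd_strd))
            {
                nnz |= 1 << ((blk_y * 4) + blk_x);
            }
        }
    }
    return nnz;
}
#endif
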
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_chroma_cb_cr_8x8_neonintr                 */
/*                                                                           */
/*  Description   : this function computes the 4x4-block nnz flags from the  */
/*                  interleaved Cb/Cr 8x8 residual buffer; Cb flags go in    */
/*                  the low nibble and Cr flags in the high nibble           */
/*                                                                           */
/*  Inputs        : pi2_rsd  - pointer to the interleaved residual buffer    */
/*                  rsd_strd - residual buffer stride in WORD16 units        */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_residual_chroma_cb_cr_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x8x2_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8x2_t resd4_in, resd5_in, resd6_in, resd7_in;

    int64x2_t resd0_cr_64x2, resd1_cr_64x2, resd2_cr_64x2, resd3_cr_64x2, resd4_cr_64x2,
        resd5_cr_64x2, resd6_cr_64x2, resd7_cr_64x2;

    int16x8_t resd_b0_r01_cr;
    int16x8_t resd_b0_r23_cr;
    int16x8_t resd_b1_r01_cr;
    int16x8_t resd_b1_r23_cr;
    int16x8_t resd_b2_r45_cr;
    int16x8_t resd_b2_r67_cr;
    int16x8_t resd_b3_r45_cr;
    int16x8_t resd_b3_r67_cr;

    int64x2_t resd0_cb_64x2, resd1_cb_64x2, resd2_cb_64x2, resd3_cb_64x2, resd4_cb_64x2,
        resd5_cb_64x2, resd6_cb_64x2, resd7_cb_64x2;

    int16x8_t resd_b0_r01_cb;
    int16x8_t resd_b0_r23_cb;
    int16x8_t resd_b1_r01_cb;
    int16x8_t resd_b1_r23_cb;
    int16x8_t resd_b2_r45_cb;
    int16x8_t resd_b2_r67_cb;
    int16x8_t resd_b3_r45_cb;
    int16x8_t resd_b3_r67_cb;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    /* vld2q_s16 de-interleaves each row: val[0] holds the Cb samples and */
    /* val[1] the Cr samples                                              */
    resd0_in = vld2q_s16((int16_t *) pi2_rsd);
    resd1_in = vld2q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld2q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    resd0_cb_64x2 = vreinterpretq_s64_s16(resd0_in.val[0]);
    resd1_cb_64x2 = vreinterpretq_s64_s16(resd1_in.val[0]);
    resd2_cb_64x2 = vreinterpretq_s64_s16(resd2_in.val[0]);
    resd3_cb_64x2 = vreinterpretq_s64_s16(resd3_in.val[0]);
    resd4_cb_64x2 = vreinterpretq_s64_s16(resd4_in.val[0]);
    resd5_cb_64x2 = vreinterpretq_s64_s16(resd5_in.val[0]);
    resd6_cb_64x2 = vreinterpretq_s64_s16(resd6_in.val[0]);
    resd7_cb_64x2 = vreinterpretq_s64_s16(resd7_in.val[0]);

    resd_b0_r01_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_cb_64x2), vget_low_s64(resd1_cb_64x2)));
    resd_b0_r23_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_cb_64x2), vget_low_s64(resd3_cb_64x2)));
    resd_b1_r01_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_cb_64x2), vget_high_s64(resd1_cb_64x2)));
    resd_b1_r23_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_cb_64x2), vget_high_s64(resd3_cb_64x2)));
    resd_b2_r45_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_cb_64x2), vget_low_s64(resd5_cb_64x2)));
    resd_b2_r67_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_cb_64x2), vget_low_s64(resd7_cb_64x2)));
    resd_b3_r45_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_cb_64x2), vget_high_s64(resd5_cb_64x2)));
    resd_b3_r67_cb = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_cb_64x2), vget_high_s64(resd7_cb_64x2)));

    resd0_cr_64x2 = vreinterpretq_s64_s16(resd0_in.val[1]);
    resd1_cr_64x2 = vreinterpretq_s64_s16(resd1_in.val[1]);
    resd2_cr_64x2 = vreinterpretq_s64_s16(resd2_in.val[1]);
    resd3_cr_64x2 = vreinterpretq_s64_s16(resd3_in.val[1]);
    resd4_cr_64x2 = vreinterpretq_s64_s16(resd4_in.val[1]);
    resd5_cr_64x2 = vreinterpretq_s64_s16(resd5_in.val[1]);
    resd6_cr_64x2 = vreinterpretq_s64_s16(resd6_in.val[1]);
    resd7_cr_64x2 = vreinterpretq_s64_s16(resd7_in.val[1]);

    resd_b0_r01_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd0_cr_64x2), vget_low_s64(resd1_cr_64x2)));
    resd_b0_r23_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd2_cr_64x2), vget_low_s64(resd3_cr_64x2)));
    resd_b1_r01_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd0_cr_64x2), vget_high_s64(resd1_cr_64x2)));
    resd_b1_r23_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd2_cr_64x2), vget_high_s64(resd3_cr_64x2)));
    resd_b2_r45_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd4_cr_64x2), vget_low_s64(resd5_cr_64x2)));
    resd_b2_r67_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(resd6_cr_64x2), vget_low_s64(resd7_cr_64x2)));
    resd_b3_r45_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd4_cr_64x2), vget_high_s64(resd5_cr_64x2)));
    resd_b3_r67_cr = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(resd6_cr_64x2), vget_high_s64(resd7_cr_64x2)));

    /* Cr 4x4-block flags, packed into the high nibble of the return value */
    dup_val_1 = vabsq_s16(resd_b0_r01_cr);
    dup_val_2 = vabsq_s16(resd_b0_r23_cr);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_cr);
    dup_val_2 = vabsq_s16(resd_b1_r23_cr);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_cr);
    dup_val_2 = vabsq_s16(resd_b2_r67_cr);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_cr);
    dup_val_2 = vabsq_s16(resd_b3_r67_cr);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz = ((nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 2) | (nnz_b3 << 3)) << 4);

    /* Cb 4x4-block flags, packed into the low nibble of the return value */
    dup_val_1 = vabsq_s16(resd_b0_r01_cb);
    dup_val_2 = vabsq_s16(resd_b0_r23_cb);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01_cb);
    dup_val_2 = vabsq_s16(resd_b1_r23_cb);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45_cb);
    dup_val_2 = vabsq_s16(resd_b2_r67_cb);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45_cb);
    dup_val_2 = vabsq_s16(resd_b3_r67_cb);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz |= (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 2) | (nnz_b3 << 3));
    return nnz;
}

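/* Illustrative scalar reference for the packing above. The residual buffer
 * holds interleaved Cb/Cr samples (Cb at even offsets, Cr at odd offsets,
 * as separated by vld2q_s16); the return value carries the four Cb
 * 4x4-block flags in the low nibble and the four Cr flags in the high
 * nibble, each in raster order. A minimal sketch; the function name is
 * hypothetical and not part of this file's API.
 */
#if 0
static WORD32 residual_chroma_cb_cr_8x8_nnz_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 plane, blk_x, blk_y, i, j, nnz = 0;
    for(plane = 0; plane < 2; plane++) /* 0: Cb, 1: Cr */
    {
        for(blk_y = 0; blk_y < 2; blk_y++)
        {
            for(blk_x = 0; blk_x < 2; blk_x++)
            {
                WORD32 any_nonzero = 0;
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        /* interleaved layout: sample j of this plane sits at */
                        /* column (j * 2) + plane of the WORD16 row           */
                        WORD32 row = (blk_y * 4) + i;
                        WORD32 col = (((blk_x * 4) + j) * 2) + plane;
                        if(pi2_rsd[(row * rsd_strd) + col]) any_nonzero = 1;
                    }
                }
                if(any_nonzero) nnz |= 1 << ((plane * 4) + (blk_y * 2) + blk_x);
            }
        }
    }
    return nnz;
}
#endif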