1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22  *******************************************************************************
23  * @file
24  *  ihevc_resi_trans_neon.c
25  *
26  * @brief
27  *  Contains definitions of functions for computing residue and fwd transform
28  *
29  * @author
30  *  Ittiam
31  *
32  * @par List of Functions:
33  *  - ihevc_resi_trans_4x4_neon()
34  *  - ihevc_resi_trans_4x4_ttype1_neon()
35  *  - ihevc_resi_trans_8x8_neon()
36  *  - ihevc_resi_trans_16x16_neon()
37  * @remarks
38  *  None
39  *
40  *******************************************************************************
41  */
42 
43 /*****************************************************************************/
44 /* File Includes                                                             */
45 /*****************************************************************************/
46 /* System include files */
47 #include <stdio.h>
48 #include <string.h>
49 
50 /* System user files */
51 #include "ihevc_typedefs.h"
52 #include "ihevc_macros.h"
53 #include "ihevc_defs.h"
54 #include "ihevc_cmn_utils_neon.h"
55 
56 #include "ihevc_trans_tables.h"
57 #include "ihevc_resi_trans.h"
58 
59 /*****************************************************************************/
60 /* Function Definitions                                                      */
61 /*****************************************************************************/
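/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 *  input 4x4 pixels
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source,
 *  followed by the 4x4 forward transform, and also accumulates the block SAD
 *
 * @returns  block sad
 *
 *******************************************************************************
 */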
62 UWORD32 ihevc_resi_trans_4x4_neon(
63     UWORD8 *pu1_src,
64     UWORD8 *pu1_pred,
65     WORD32 *pi4_temp,
66     WORD16 *pi2_dst,
67     WORD32 src_strd,
68     WORD32 pred_strd,
69     WORD32 dst_strd,
70     CHROMA_PLANE_ID_T e_chroma_plane)
71 {
72     UWORD32 sad;
73     uint8x16_t inp_buf, pred_buf;
74     int16x8_t diff_1, diff_2;
75     int16x4_t diff_1_low, diff_1_high, diff_2_low, diff_2_high;
76     int16x8_t e_01, o_32;
77     int16x4_t e_0, e_1, o_0, o_1;
78     int32x4_t e_0_a_e_1, e_0_s_e_1;
79     int32x4_t temp1, temp2, temp3, temp4;
80     int32x4_t o_1_m_trans_10, o_1_m_trans_11;
81     int32x4_t e_03, e_12, o_03, o_12;
82     int16x4_t out_0, out_1, out_2, out_3;
83     uint16x8_t abs;
84     uint32x4_t b;
85     uint64x2_t c;
86 
87     (void)pi4_temp;
88     if(e_chroma_plane == NULL_PLANE)
89     {
90         inp_buf = load_unaligned_u8q(pu1_src, src_strd);
91         pred_buf = load_unaligned_u8q(pu1_pred, pred_strd);
92     }
93     else
94     {
95         inp_buf = load_unaligned_u8qi(pu1_src + e_chroma_plane, src_strd);
96         pred_buf = load_unaligned_u8qi(pu1_pred + e_chroma_plane, pred_strd);
97     }
98 
99     abs = vabdl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf));
100     abs = vabal_u8(abs, vget_high_u8(inp_buf), vget_high_u8(pred_buf));
101     b = vpaddlq_u16(abs);
102     c = vpaddlq_u32(b);
103     sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
104                                  vreinterpret_u32_u64(vget_high_u64(c))),
105             0);
106 
107     diff_1 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf)));
108     diff_2 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(inp_buf), vget_high_u8(pred_buf)));
109 
110     diff_1_low = vget_low_s16(diff_1);
111     diff_1_high = vget_high_s16(diff_1);
112     diff_2_low = vget_low_s16(diff_2);
113     diff_2_high = vget_high_s16(diff_2);
114 
115     transpose_s16_4x4d(&diff_1_low, &diff_1_high, &diff_2_low, &diff_2_high);
116     diff_1 = vcombine_s16(diff_1_low, diff_1_high);
117     diff_2 = vcombine_s16(diff_2_high, diff_2_low);
118 
119     e_01 = vaddq_s16(diff_1, diff_2);
120     o_32 = vsubq_s16(diff_1, diff_2);
121 
122     e_0 = vget_low_s16(e_01);
123     e_1 = vget_high_s16(e_01);
124     o_0 = vget_high_s16(o_32);
125     o_1 = vget_low_s16(o_32);
126 
127     e_0_a_e_1 = vaddl_s16(e_0, e_1);
128     e_0_s_e_1 = vsubl_s16(e_0, e_1);
129 
130     temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
131     temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
132 
133     o_1_m_trans_10 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][0]);
134     o_1_m_trans_11 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][1]);
135 
136     temp3 = vmlal_n_s16(o_1_m_trans_10, o_0, (WORD32)g_ai2_ihevc_trans_4[1][1]);
137     temp4 = vmlsl_n_s16(o_1_m_trans_11, o_0, (WORD32)g_ai2_ihevc_trans_4[1][0]);
138 
139     transpose_s32_4x4(&temp1, &temp3, &temp2, &temp4);
140 
141     e_03 = vaddq_s32(temp1, temp4);
142     e_12 = vaddq_s32(temp3, temp2);
143     o_03 = vsubq_s32(temp1, temp4);
144     o_12 = vsubq_s32(temp3, temp2);
145 
146     e_0_a_e_1 = vaddq_s32(e_03, e_12);
147     e_0_s_e_1 = vsubq_s32(e_03, e_12);
148 
149     temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
150     temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
151 
152     o_1_m_trans_10 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][0]);
153     o_1_m_trans_11 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][1]);
154 
155     temp3 = vmlaq_n_s32(o_1_m_trans_10, o_12, (WORD32)g_ai2_ihevc_trans_4[1][1]);
156     temp4 = vmlsq_n_s32(o_1_m_trans_11, o_12, (WORD32)g_ai2_ihevc_trans_4[1][0]);
157 
158     out_0 = vrshrn_n_s32(temp1, 9);
159     out_1 = vrshrn_n_s32(temp3, 9);
160     out_2 = vrshrn_n_s32(temp2, 9);
161     out_3 = vrshrn_n_s32(temp4, 9);
162 
163     vst1_s16(pi2_dst, out_0);
164     vst1_s16(pi2_dst + dst_strd, out_1);
165     vst1_s16(pi2_dst + 2 * dst_strd, out_2);
166     vst1_s16(pi2_dst + 3 * dst_strd, out_3);
167 
168     return sad;
169 }
170 
171 /**
172  *******************************************************************************
173  *
174  * @brief
175  *  This function performs residue calculation and forward  transform type 1
176  *  on input pixels
177  *
178  * @par Description:
179  *  Performs residue calculation by subtracting prediction from source,
180  *  followed by forward transform
181  *
182  * @param[in] pu1_src
183  *  Input 4x4 pixels
184  *
185  * @param[in] pu1_pred
186  *  Prediction data
187  *
188  * @param[in] pi2_tmp
189  *  Temporary buffer of size 4x4
190  *
191  * @param[out] pi2_dst
192  *  Output 4x4 coefficients
193  *
194  * @param[in] src_strd
195  *  Input stride
196  *
197  * @param[in] pred_strd
198  *  Prediction Stride
199  *
200  * @param[in] dst_strd
201  *  Output Stride
202  *
203  * @param[in] e_chroma_plane
204  *  Enum signalling chroma plane
205  *
206  * @returns  block sad
207  *
208  * @remarks
209  *  None
210  *
211  *******************************************************************************
212  */
213 UWORD32 ihevc_resi_trans_4x4_ttype1_neon(
214     UWORD8 *pu1_src,
215     UWORD8 *pu1_pred,
216     WORD32 *pi4_temp,
217     WORD16 *pi2_dst,
218     WORD32 src_strd,
219     WORD32 pred_strd,
220     WORD32 dst_strd,
221     CHROMA_PLANE_ID_T e_chroma_plane)
222 {
223     UWORD32 sad;
224     int16x4_t src0_4x16b;
225     int16x4_t src1_4x16b;
226     int16x4_t src2_4x16b;
227     int16x4_t src3_4x16b;
228     int32x4_t src0_4x32b;
229     int32x4_t src1_4x32b;
230     int32x4_t src2_4x32b;
231     int32x4_t src3_4x32b;
232     /*load source and pred values */
233     const uint8x16_t src_u8 = load_unaligned_u8q(pu1_src, src_strd);
234     const uint8x16_t pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
235 
236     const int16x8_t src_reg0 =
237         vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
238     const int16x8_t src_reg1 =
239         vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));
240 
241     int32x4_t add_val = vdupq_n_s32(1);
242 
243     uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8));
244     uint32x4_t b;
245     uint64x2_t c;
246     UNUSED(e_chroma_plane);
247     abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(pred_u8));
248     b = vpaddlq_u16(abs);
249     c = vpaddlq_u32(b);
250     sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
251                                  vreinterpret_u32_u64(vget_high_u64(c))),
252             0);
253 
254     (void)pi4_temp;
255 
256     /*************************    4x4 16bit Transpose  ***********************/
257     src0_4x16b = vget_low_s16(src_reg0);
258     src1_4x16b = vget_high_s16(src_reg0);
259     src2_4x16b = vget_low_s16(src_reg1);
260     src3_4x16b = vget_high_s16(src_reg1);
261 
262     transpose_s16_4x4d(&src0_4x16b, &src1_4x16b, &src2_4x16b, &src3_4x16b);
263 
264     /**************************  4x4 Transpose End   *************************/
265 
266     /* Residue + Forward Transform 1st stage */
267     /* coeff2_4x32b = 74 74 74 74 */
268     const int32x4_t coeff2_4x32b =
269         vdupq_n_s32(74);  //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[2][0]);
270     /* coeff0_4x32b = 29 29 29 29 */
271     const int32x4_t coeff0_4x32b =
272         vdupq_n_s32(29);  //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[0][0]);
273     /* coeff1_4x32b = 55 55 55 55 */
274     const int32x4_t coeff1_4x32b =
275         vdupq_n_s32(55);  //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[1][0]);
276 
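    /*
     * Fast factorisation of the 4-point forward DST (transform type 1):
     * with c0 = r0 + r3, c1 = r1 + r3, c2 = r0 - r1 and c3 = 74*r2,
     *   d0 = 29*c0 + 55*c1 + c3
     *   d1 = 74*(r0 + r1 - r3)
     *   d2 = 29*c2 + 55*c0 - c3
     *   d3 = 55*c2 - 29*c1 + c3
     * Both transform stages below follow this decomposition.
     */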
277     /* c0 to c3 calculation */
278     int32x4_t c0_4x32b = vaddl_s16(src0_4x16b, src3_4x16b); /* r0+r3 */
279     int32x4_t c1_4x32b = vaddl_s16(src1_4x16b, src3_4x16b); /* r1+r3 */
280     int32x4_t c2_4x32b = vsubl_s16(src0_4x16b, src1_4x16b); /* r0-r1 */
281     int32x4_t c3_4x32b = vmulq_s32(vmovl_s16(src2_4x16b), coeff2_4x32b); /* 74*r2 */
282     src0_4x16b = vadd_s16(src0_4x16b, src1_4x16b); /* r0+r1 */
283 
284     src1_4x32b = vsubl_s16(src0_4x16b, src3_4x16b); /* r0+r1-r3 */
285     src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
286     src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2 - c3 */
287     src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
288     src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
289 
290     src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
291     src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
292     c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 55*c2 - 29*c1 + c3 */
293     src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
294     src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
295 
296     /* result + add */
297     src1_4x32b = vaddq_s32(src1_4x32b, add_val);
298     src0_4x32b = vaddq_s32(src0_4x32b, add_val);
299     src2_4x32b = vaddq_s32(src2_4x32b, add_val);
300     src3_4x32b = vaddq_s32(src3_4x32b, add_val);
301     /* result >> shift */
302     src1_4x32b = vshrq_n_s32(src1_4x32b, 1);
303     src0_4x32b = vshrq_n_s32(src0_4x32b, 1);
304     src2_4x32b = vshrq_n_s32(src2_4x32b, 1);
305     src3_4x32b = vshrq_n_s32(src3_4x32b, 1);
306     /* Forward transform 2nd stage */
307     {
308         /*************************    4x4 32bit Transpose  ***********************/
309 
310         transpose_s32_4x4(&src0_4x32b, &src1_4x32b, &src2_4x32b, &src3_4x32b);
311 
312         /**************************  4x4 Transpose End   *************************/
313 
314         /* add value */
315         add_val = vdupq_n_s32(128);
316         c0_4x32b = vaddq_s32(src0_4x32b, src3_4x32b); /* r0+r3 */
317         c1_4x32b = vaddq_s32(src1_4x32b, src3_4x32b); /* r1+r3 */
318         c2_4x32b = vsubq_s32(src0_4x32b, src1_4x32b); /* r0-r1 */
319         c3_4x32b = vmulq_s32(src2_4x32b, coeff2_4x32b); /* 74*r2 */
320         src1_4x32b = vaddq_s32(src0_4x32b, src1_4x32b); /* r0+r1 */
321 
322         src1_4x32b = vsubq_s32(src1_4x32b, src3_4x32b); /* r0+r1-r3 */
323         src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
324         src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2 - c3 */
325         src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
326         src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
327 
328         src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
329         src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
330         c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 55*c2 - 29*c1 + c3 */
331         src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
332         src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
333 
334         /* result + add */
335         src1_4x32b = vaddq_s32(src1_4x32b, add_val);
336         src0_4x32b = vaddq_s32(src0_4x32b, add_val);
337         src2_4x32b = vaddq_s32(src2_4x32b, add_val);
338         src3_4x32b = vaddq_s32(src3_4x32b, add_val);
339 
340         src1_4x32b = vshrq_n_s32(src1_4x32b, 8);
341         src0_4x32b = vshrq_n_s32(src0_4x32b, 8);
342         src2_4x32b = vshrq_n_s32(src2_4x32b, 8);
343         src3_4x32b = vshrq_n_s32(src3_4x32b, 8);
344 
345         vst1_s16((pi2_dst + dst_strd), vmovn_s32(src1_4x32b));
346         vst1_s16(pi2_dst, vmovn_s32(src0_4x32b));
347         vst1_s16((pi2_dst + 2 * dst_strd), vmovn_s32(src2_4x32b));
348         vst1_s16((pi2_dst + 3 * dst_strd), vmovn_s32(src3_4x32b));
349     }
350     return sad;
351 }
352 
353 /**
354  *******************************************************************************
355  *
356  * @brief
357  *  This function performs residue calculation and forward  transform on
358  * input pixels
359  *
360  * @par Description:
361  *  Performs residue calculation by subtracting prediction from source,
362  * followed by forward transform
363  *
364  * @param[in] pu1_src
365  *  Input 8x8 pixels
366  *
367  * @param[in] pu1_pred
368  *  Prediction data
369  *
370  * @param[in] pi2_tmp
371  *  Temporary buffer of size 8x8
372  *
373  * @param[out] pi2_dst
374  *  Output 8x8 coefficients
375  *
376  * @param[in] src_strd
377  *  Input stride
378  *
379  * @param[in] pred_strd
380  *  Prediction Stride
381  *
382  * @param[in] dst_strd
383  *  Output Stride
384  *
385  * @param[in] e_chroma_plane
386  *  Enum signalling chroma plane
387  *
388  * @returns  block sad
389  *
390  * @remarks
391  *  None
392  *
393  *******************************************************************************
394  */
395 UWORD32 ihevc_resi_trans_8x8_neon(
396     UWORD8 *pu1_src,
397     UWORD8 *pu1_pred,
398     WORD32 *pi4_temp,
399     WORD16 *pi2_dst,
400     WORD32 src_strd,
401     WORD32 pred_strd,
402     WORD32 dst_strd,
403     CHROMA_PLANE_ID_T e_chroma_plane)
404 {
405     int16x8_t diff_16[8];
406     int16x8_t abs = vdupq_n_s16(0);
407     int32x4_t tmp_a;
408     int64x2_t tmp_b;
409     int32x2_t sad_v;
410     int32x4x2_t a0, a1, a2, a3, a4, a5, a6, a7;
411     UWORD32 sad;
412 
413     (void)pi4_temp;
414     // stage 1
415     for(int k = 0; k < 8; k++)
416     {
417         if(NULL_PLANE == e_chroma_plane)
418         {
419             diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(pu1_src), vld1_u8(pu1_pred)));
420         }
421         else
422         {
423             diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(vld2_u8(pu1_src).val[e_chroma_plane],
424                                                         vld2_u8(pu1_pred).val[e_chroma_plane]));
425         }
426         pu1_src += src_strd;
427         pu1_pred += pred_strd;
428         abs = vaddq_s16(abs, vabsq_s16(diff_16[k]));
429     }
430 
431     tmp_a = vpaddlq_s16(abs);
432     tmp_b = vpaddlq_s32(tmp_a);
433     sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_b)),
434                    vreinterpret_s32_s64(vget_high_s64(tmp_b)));
435     sad = vget_lane_s32(sad_v, 0);
436 
437     transpose_s16_8x8(
438         &diff_16[0],
439         &diff_16[1],
440         &diff_16[2],
441         &diff_16[3],
442         &diff_16[4],
443         &diff_16[5],
444         &diff_16[6],
445         &diff_16[7]);
446 
447     {
448         const int16x8_t o3 = vsubq_s16(diff_16[3], diff_16[4]); /*C3 - C4*/
449         const int16x8_t o2 = vsubq_s16(diff_16[2], diff_16[5]); /*C2 - C5*/
450         const int16x8_t o1 = vsubq_s16(diff_16[1], diff_16[6]); /*C1 - C6*/
451         const int16x8_t o0 = vsubq_s16(diff_16[0], diff_16[7]); /*C0 - C7*/
452         const int16x8_t e0 = vaddq_s16(diff_16[0], diff_16[7]); /*C0 + C7*/
453         const int16x8_t e1 = vaddq_s16(diff_16[1], diff_16[6]); /*C1 + C6*/
454         const int16x8_t e2 = vaddq_s16(diff_16[2], diff_16[5]); /*C2 + C5*/
455         const int16x8_t e3 = vaddq_s16(diff_16[3], diff_16[4]); /*C3 + C4*/
456 
457         const int16x8_t ee0 = vaddq_s16(e0, e3); /*C0 + C3 + C4 + C7*/
458         const int16x8_t ee1 = vaddq_s16(e1, e2); /*C1 + C2 + C5 + C6*/
459         const int16x8_t eo0 = vsubq_s16(e0, e3); /*C0 - C3 - C4 + C7*/
460         const int16x8_t eo1 = vsubq_s16(e1, e2); /*C1 - C2 - C5 + C6*/
461 
462         /*C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7*/
463         const int16x8_t eee = vaddq_s16(ee1, ee0);
464         /*C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7*/
465         const int16x8_t eeo = vsubq_s16(ee0, ee1);
466 
467         /*F2[0] of 83*(C0 - C3 - C4 + C7)*/
468         a2.val[0] = vmull_n_s16(vget_low_s16(eo0), 83);
469         /*F6[0] of 36*(C0 - C3 - C4 + C7)*/
470         a6.val[0] = vmull_n_s16(vget_low_s16(eo0), 36);
471         /*F2[1] of 83*(C0 - C3 - C4 + C7)*/
472         a2.val[1] = vmull_n_s16(vget_high_s16(eo0), 83);
473         /*F6[1] of 36*(C0 - C3 - C4 + C7)*/
474         a6.val[1] = vmull_n_s16(vget_high_s16(eo0), 36);
475 
476         /*F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
477         a6.val[1] = vmlsl_n_s16(a6.val[1], vget_high_s16(eo1), 83);
478         /*F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
479         a2.val[1] = vmlal_n_s16(a2.val[1], vget_high_s16(eo1), 36);
480         /*F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
481         a6.val[0] = vmlsl_n_s16(a6.val[0], vget_low_s16(eo1), 83);
482         /*F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
483         a2.val[0] = vmlal_n_s16(a2.val[0], vget_low_s16(eo1), 36);
484 
485         /*F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
486         a0.val[0] = vshll_n_s16(vget_low_s16(eee), 6);
487         /*F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
488         a0.val[1] = vshll_n_s16(vget_high_s16(eee), 6);
489         /*F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
490         a4.val[0] = vshll_n_s16(vget_low_s16(eeo), 6);
491         /*F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
492         a4.val[1] = vshll_n_s16(vget_high_s16(eeo), 6);
493 
494         a7.val[0] = vmull_n_s16(vget_low_s16(o0), 18); /*F7[0] = 18*(C0 - C7)*/
495         a5.val[0] = vmull_n_s16(vget_low_s16(o0), 50); /*F5[0] = 50*(C0 - C7)*/
496         a3.val[0] = vmull_n_s16(vget_low_s16(o0), 75); /*F3[0] = 75*(C0 - C7)*/
497         a1.val[0] = vmull_n_s16(vget_low_s16(o0), 89); /*F1[0] = 89*(C0 - C7)*/
498         a1.val[1] = vmull_n_s16(vget_high_s16(o0), 89); /*F1[1] = 89*(C0 - C7)*/
499         a3.val[1] = vmull_n_s16(vget_high_s16(o0), 75); /*F3[1] = 75*(C0 - C7)*/
500         a5.val[1] = vmull_n_s16(vget_high_s16(o0), 50); /*F5[1] = 50*(C0 - C7)*/
501         a7.val[1] = vmull_n_s16(vget_high_s16(o0), 18); /*F7[1] = 18*(C0 - C7)*/
502 
503         /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6)*/
504         a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o1), 50);
505         /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6)*/
506         a5.val[0] = vmlsl_n_s16(a5.val[0], vget_low_s16(o1), 89);
507         /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6)*/
508         a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o1), 18);
509         /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6)*/
510         a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o1), 75);
511         /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6)*/
512         a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o1), 75);
513         /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6)*/
514         a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o1), 18);
515         /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6)*/
516         a5.val[1] = vmlsl_n_s16(a5.val[1], vget_high_s16(o1), 89);
517         /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6)*/
518         a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o1), 50);
519 
520         /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
521         a7.val[0] = vmlal_n_s16(a7.val[0], vget_low_s16(o2), 75);
522         /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
523         a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o2), 18);
524         /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
525         a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o2), 89);
526         /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
527         a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o2), 50);
528         /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
529         a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o2), 50);
530         /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
531         a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o2), 89);
532         /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
533         a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o2), 18);
534         /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
535         a7.val[1] = vmlal_n_s16(a7.val[1], vget_high_s16(o2), 75);
536 
537         /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
538         a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o3), 89);
539         /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
540         a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o3), 75);
541         /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
542         a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o3), 50);
543         /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
544         a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o3), 18);
545         /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
546         a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o3), 18);
547         /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
548         a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o3), 50);
549         /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
550         a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o3), 75);
551         /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
552         a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o3), 89);
553     }
554 
555     //Stage 2
556     {
557         int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
558         int32x4_t e0_2, e1_2, e2_2, e3_2;
559         int32x4_t o0_2, o1_2, o2_2, o3_2;
560         int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
561         int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
562 
563         /*Transposing second half of transform stage 1 (1)*/
564         int32x4x2_t b1 = vtrnq_s32(a0.val[1], a1.val[1]);
565         int32x4x2_t b3 = vtrnq_s32(a2.val[1], a3.val[1]);
566         int32x4x2_t b0 = vtrnq_s32(a0.val[0], a1.val[0]);
567         int32x4x2_t b2 = vtrnq_s32(a2.val[0], a3.val[0]);
568 
569         /*Transposing second half of transform stage 1 (2)*/
570         a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
571         a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
572         a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
573         a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
574         a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
575         a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
576         a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
577         a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
578 
579         o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
580         o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
581         o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
582         o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
583         e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
584         e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
585         e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
586         e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
587 
588         eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
589         ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
590         eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
591         ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
592 
593         /* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
594         h4 = vsubq_s32(ee0_2, ee1_2);
595         /* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
596         h0 = vaddq_s32(ee0_2, ee1_2);
597         /* Rounding shift by 5 in H0 (the row weight of 64 is folded into the 11-bit truncation) */
598         row0 = vrshrn_n_s32(h0, 5);
599         /*First half-row of row 1 of transform stage 2 (H0) stored*/
600         vst1_s16(pi2_dst, row0);
601         /* Rounding shift by 5 in H4 (the row weight of 64 is folded into the 11-bit truncation) */
602         row4 = vrshrn_n_s32(h4, 5);
603         /*First half-row of row 5 of transform stage 2 (H4) stored*/
604         vst1_s16(pi2_dst + 4 * dst_strd, row4);
605 
606         /* F6 = 36*(B0 - B3 - B4 + B7) */
607         h6 = vmulq_n_s32(eo0_2, 36);
608         /* F2 = 83*(B0 - B3 - B4 + B7) */
609         h2 = vmulq_n_s32(eo0_2, 83);
610         /*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
611         h2 = vmlaq_n_s32(h2, eo1_2, 36);
612         /*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
613         h6 = vmlsq_n_s32(h6, eo1_2, 83);
614         /* Truncating last 11 bits in H6*/
615         row6 = vrshrn_n_s32(h6, 11);
616         /*First half-row of row 7 of transform stage 2 (H6) stored*/
617         vst1_s16(pi2_dst + 6 * dst_strd, row6);
618         /* Truncating last 11 bits in H2*/
619         row2 = vrshrn_n_s32(h2, 11);
620         /*First half-row of row 3 of transform stage 2 (H2) stored*/
621         vst1_s16(pi2_dst + 2 * dst_strd, row2);
622 
623         h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
624         h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
625         h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
626         h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
627 
628         h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
629         h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
630         h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
631         h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
632 
633         /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
634         h1 = vmlaq_n_s32(h1, o2_2, 50);
635         /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
636         h3 = vmlsq_n_s32(h3, o2_2, 89);
637         /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
638         h5 = vmlaq_n_s32(h5, o2_2, 18);
639         /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
640         h7 = vmlaq_n_s32(h7, o2_2, 75);
641 
642         /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
643         h7 = vmlsq_n_s32(h7, o3_2, 89);
644         /* Truncating last 11 bits in H7*/
645         row7 = vrshrn_n_s32(h7, 11);
646         /*First half-row of row 8 of transform stage 2 (H7) stored*/
647         vst1_s16(pi2_dst + 7 * dst_strd, row7);
648         /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
649         h5 = vmlaq_n_s32(h5, o3_2, 75);
650         /* Truncating last 11 bits in H5*/
651         row5 = vrshrn_n_s32(h5, 11);
652         /*First half-row of row 6 of transform stage 2 (H5) stored*/
653         vst1_s16(pi2_dst + 5 * dst_strd, row5);
654         /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
655         h3 = vmlsq_n_s32(h3, o3_2, 50);
656         /* Truncating last 11 bits in H3*/
657         row3 = vrshrn_n_s32(h3, 11);
658         /*First half-row of row 4 of transform stage 2 (H3) stored*/
659         vst1_s16(pi2_dst + 3 * dst_strd, row3);
660         /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
661         h1 = vmlaq_n_s32(h1, o3_2, 18);
662         /* Truncating last 11 bits in H1*/
663         row1 = vrshrn_n_s32(h1, 11);
664         /*First half-row of row 2 of transform stage 2 (H1) stored*/
665         vst1_s16(pi2_dst + dst_strd, row1);
666     }
667 
668     pi2_dst += 4;
669 
670     {
671         int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
672         int32x4_t e0_2, e1_2, e2_2, e3_2;
673         int32x4_t o0_2, o1_2, o2_2, o3_2;
674         int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
675         int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
676 
677         /*Transposing second half of transform stage 1 (1)*/
678         int32x4x2_t b1 = vtrnq_s32(a4.val[1], a5.val[1]);
679         int32x4x2_t b3 = vtrnq_s32(a6.val[1], a7.val[1]);
680         int32x4x2_t b0 = vtrnq_s32(a4.val[0], a5.val[0]);
681         int32x4x2_t b2 = vtrnq_s32(a6.val[0], a7.val[0]);
682 
683         /*Transposing second half of transform stage 1 (2)*/
684         a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
685         a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
686         a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
687         a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
688         a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
689         a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
690         a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
691         a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
692 
693         o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
694         o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
695         o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
696         o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
697         e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
698         e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
699         e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
700         e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
701 
702         eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
703         ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
704         eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
705         ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
706 
707         /* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
708         h4 = vsubq_s32(ee0_2, ee1_2);
709         /* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
710         h0 = vaddq_s32(ee0_2, ee1_2);
711         /* Rounding shift by 5 in H0 (the row weight of 64 is folded into the 11-bit truncation) */
712         row0 = vrshrn_n_s32(h0, 5);
713         /*Second half-row of row 1 of transform stage 2 (H0) stored*/
714         vst1_s16(pi2_dst, row0);
715         /* Rounding shift by 5 in H4 (the row weight of 64 is folded into the 11-bit truncation) */
716         row4 = vrshrn_n_s32(h4, 5);
717         /*Second half-row of row 5 of transform stage 2 (H4) stored*/
718         vst1_s16(pi2_dst + 4 * dst_strd, row4);
719 
720         /* F6 = 36*(B0 - B3 - B4 + B7) */
721         h6 = vmulq_n_s32(eo0_2, 36);
722         /* F2 = 83*(B0 - B3 - B4 + B7) */
723         h2 = vmulq_n_s32(eo0_2, 83);
724         /*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
725         h2 = vmlaq_n_s32(h2, eo1_2, 36);
726         /*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
727         h6 = vmlsq_n_s32(h6, eo1_2, 83);
728         /* Truncating last 11 bits in H6*/
729         row6 = vrshrn_n_s32(h6, 11);
730         /*Second half-row of row 7 of transform stage 2 (H6) stored*/
731         vst1_s16(pi2_dst + 6 * dst_strd, row6);
732         /* Truncating last 11 bits in H2*/
733         row2 = vrshrn_n_s32(h2, 11);
734         /*Second half-row of row 3 of transform stage 2 (H2) stored*/
735         vst1_s16(pi2_dst + 2 * dst_strd, row2);
736 
737         h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
738         h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
739         h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
740         h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
741 
742         h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
743         h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
744         h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
745         h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
746 
747         /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
748         h1 = vmlaq_n_s32(h1, o2_2, 50);
749         /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
750         h3 = vmlsq_n_s32(h3, o2_2, 89);
751         /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
752         h5 = vmlaq_n_s32(h5, o2_2, 18);
753         /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
754         h7 = vmlaq_n_s32(h7, o2_2, 75);
755 
756         /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
757         h7 = vmlsq_n_s32(h7, o3_2, 89);
758         /* Truncating last 11 bits in H7*/
759         row7 = vrshrn_n_s32(h7, 11);
760         /*Second half-row of row 8 of transform stage 2 (H7) stored*/
761         vst1_s16(pi2_dst + 7 * dst_strd, row7);
762         /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
763         h5 = vmlaq_n_s32(h5, o3_2, 75);
764         /* Truncating last 11 bits in H5*/
765         row5 = vrshrn_n_s32(h5, 11);
766         /*Second half-row of row 6 of transform stage 2 (H5) stored*/
767         vst1_s16(pi2_dst + 5 * dst_strd, row5);
768         /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
769         h3 = vmlsq_n_s32(h3, o3_2, 50);
770         /* Truncating last 11 bits in H3*/
771         row3 = vrshrn_n_s32(h3, 11);
772         /*Second half-row of row 4 of transform stage 2 (H3) stored*/
773         vst1_s16(pi2_dst + 3 * dst_strd, row3);
774         /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
775         h1 = vmlaq_n_s32(h1, o3_2, 18);
776         /* Truncating last 11 bits in H1*/
777         row1 = vrshrn_n_s32(h1, 11);
778         /*Second half-row of row 2 of transform stage 2 (H1) stored*/
779         vst1_s16(pi2_dst + dst_strd, row1);
780     }
781     return sad;
782 }
783 
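// Load 16 rows of 8 pixels each; for a chroma plane the interleaved CbCr
// samples are de-interleaved and only the requested plane is kept.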
784 static INLINE void load(const uint8_t *a, int stride, uint8x8_t *b,
785                         CHROMA_PLANE_ID_T e_chroma_plane)
786 {
787     int i;
788 
789     if(e_chroma_plane == NULL_PLANE)
790     {
791         for (i = 0; i < 16; i++)
792         {
793             b[i] = vld1_u8(a);
794             a += stride;
795         }
796     }
797     else
798     {
799         for (i = 0; i < 16; i++)
800         {
801             b[i] = vld2_u8(a).val[e_chroma_plane];
802             a += stride;
803         }
804     }
805 }
806 
807 // Store 8 rows of 8 16-bit values each, assuming a stride of 16.
808 static INLINE void store(WORD16 *a, int16x8_t *b /*[8]*/)
809 {
810     int i;
811 
812     for (i = 0; i < 8; i++)
813     {
814         vst1q_s16(a, b[i]);
815         a += 16;
816     }
817 }
818 
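// First butterfly of the 16-point transform on 16-bit residues:
// b[i] = a[i] + a[15 - i] and b[8 + i] = a[7 - i] - a[8 + i].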
819 static INLINE void cross_input_16(int16x8_t *a /*[16]*/, int16x8_t *b /*[16]*/)
820 {
821     b[0] = vaddq_s16(a[0], a[15]);
822     b[1] = vaddq_s16(a[1], a[14]);
823     b[2] = vaddq_s16(a[2], a[13]);
824     b[3] = vaddq_s16(a[3], a[12]);
825     b[4] = vaddq_s16(a[4], a[11]);
826     b[5] = vaddq_s16(a[5], a[10]);
827     b[6] = vaddq_s16(a[6], a[9]);
828     b[7] = vaddq_s16(a[7], a[8]);
829 
830     b[8] = vsubq_s16(a[7], a[8]);
831     b[9] = vsubq_s16(a[6], a[9]);
832     b[10] = vsubq_s16(a[5], a[10]);
833     b[11] = vsubq_s16(a[4], a[11]);
834     b[12] = vsubq_s16(a[3], a[12]);
835     b[13] = vsubq_s16(a[2], a[13]);
836     b[14] = vsubq_s16(a[1], a[14]);
837     b[15] = vsubq_s16(a[0], a[15]);
838 }
839 
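// Same butterfly as cross_input_16, applied to the 32-bit data of the
// second pass.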
840 static INLINE void cross_input_32(int32x4x2_t *a /*[16][2]*/, int32x4x2_t *b /*[16][2]*/)
841 {
842     WORD32 i;
843     for(i = 0; i < 2; i++)
844     {
845         b[0].val[i] = vaddq_s32(a[0].val[i], a[15].val[i]);
846         b[1].val[i] = vaddq_s32(a[1].val[i], a[14].val[i]);
847         b[2].val[i] = vaddq_s32(a[2].val[i], a[13].val[i]);
848         b[3].val[i] = vaddq_s32(a[3].val[i], a[12].val[i]);
849         b[4].val[i] = vaddq_s32(a[4].val[i], a[11].val[i]);
850         b[5].val[i] = vaddq_s32(a[5].val[i], a[10].val[i]);
851         b[6].val[i] = vaddq_s32(a[6].val[i], a[9].val[i]);
852         b[7].val[i] = vaddq_s32(a[7].val[i], a[8].val[i]);
853 
854         b[8].val[i] = vsubq_s32(a[7].val[i], a[8].val[i]);
855         b[9].val[i] = vsubq_s32(a[6].val[i], a[9].val[i]);
856         b[10].val[i] = vsubq_s32(a[5].val[i], a[10].val[i]);
857         b[11].val[i] = vsubq_s32(a[4].val[i], a[11].val[i]);
858         b[12].val[i] = vsubq_s32(a[3].val[i], a[12].val[i]);
859         b[13].val[i] = vsubq_s32(a[2].val[i], a[13].val[i]);
860         b[14].val[i] = vsubq_s32(a[1].val[i], a[14].val[i]);
861         b[15].val[i] = vsubq_s32(a[0].val[i], a[15].val[i]);
862     }
863 }
864 
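// Compute 16 rows of residue c[i] = a[i] - b[i] and return the widened
// partial sums of absolute differences (reduced to the block SAD by the
// caller).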
865 static INLINE int32x4_t diff(uint8x8_t *a /*[16]*/, uint8x8_t *b /*[16]*/, int16x8_t *c /*[16]*/)
866 {
867     int i;
868     int16x8_t abs = vdupq_n_s16(0);
869 
870     for (i = 0; i < 16; i++)
871     {
872         c[i] = vreinterpretq_s16_u16(vsubl_u8(a[i], b[i]));
873         abs = vaddq_s16(abs, vabsq_s16(c[i]));
874     }
875     return vpaddlq_s16(abs);
876 }
877 
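// Round, shift right by 13 and narrow the 32-bit second-pass results to
// 16-bit coefficients.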
878 static INLINE void partial_round_shift(int32x4x2_t *a, int16x8_t *b /*[16]*/)
879 {
880     WORD32 shift = 13, add;
881     add = 1 << (shift - 1);
882 
883     const int32x4_t vecadd = vdupq_n_s32(add);
884     b[0] = vcombine_s16(
885         vshrn_n_s32(vaddq_s32(a[0].val[0], vecadd), 13),
886         vshrn_n_s32(vaddq_s32(a[0].val[1], vecadd), 13));
887     b[1] = vcombine_s16(
888         vshrn_n_s32(vaddq_s32(a[1].val[0], vecadd), 13),
889         vshrn_n_s32(vaddq_s32(a[1].val[1], vecadd), 13));
890     b[2] = vcombine_s16(
891         vshrn_n_s32(vaddq_s32(a[2].val[0], vecadd), 13),
892         vshrn_n_s32(vaddq_s32(a[2].val[1], vecadd), 13));
893     b[3] = vcombine_s16(
894         vshrn_n_s32(vaddq_s32(a[3].val[0], vecadd), 13),
895         vshrn_n_s32(vaddq_s32(a[3].val[1], vecadd), 13));
896     b[4] = vcombine_s16(
897         vshrn_n_s32(vaddq_s32(a[4].val[0], vecadd), 13),
898         vshrn_n_s32(vaddq_s32(a[4].val[1], vecadd), 13));
899     b[5] = vcombine_s16(
900         vshrn_n_s32(vaddq_s32(a[5].val[0], vecadd), 13),
901         vshrn_n_s32(vaddq_s32(a[5].val[1], vecadd), 13));
902     b[6] = vcombine_s16(
903         vshrn_n_s32(vaddq_s32(a[6].val[0], vecadd), 13),
904         vshrn_n_s32(vaddq_s32(a[6].val[1], vecadd), 13));
905     b[7] = vcombine_s16(
906         vshrn_n_s32(vaddq_s32(a[7].val[0], vecadd), 13),
907         vshrn_n_s32(vaddq_s32(a[7].val[1], vecadd), 13));
908     b[8] = vcombine_s16(
909         vshrn_n_s32(vaddq_s32(a[8].val[0], vecadd), 13),
910         vshrn_n_s32(vaddq_s32(a[8].val[1], vecadd), 13));
911     b[9] = vcombine_s16(
912         vshrn_n_s32(vaddq_s32(a[9].val[0], vecadd), 13),
913         vshrn_n_s32(vaddq_s32(a[9].val[1], vecadd), 13));
914     b[10] = vcombine_s16(
915         vshrn_n_s32(vaddq_s32(a[10].val[0], vecadd), 13),
916         vshrn_n_s32(vaddq_s32(a[10].val[1], vecadd), 13));
917     b[11] = vcombine_s16(
918         vshrn_n_s32(vaddq_s32(a[11].val[0], vecadd), 13),
919         vshrn_n_s32(vaddq_s32(a[11].val[1], vecadd), 13));
920     b[12] = vcombine_s16(
921         vshrn_n_s32(vaddq_s32(a[12].val[0], vecadd), 13),
922         vshrn_n_s32(vaddq_s32(a[12].val[1], vecadd), 13));
923     b[13] = vcombine_s16(
924         vshrn_n_s32(vaddq_s32(a[13].val[0], vecadd), 13),
925         vshrn_n_s32(vaddq_s32(a[13].val[1], vecadd), 13));
926     b[14] = vcombine_s16(
927         vshrn_n_s32(vaddq_s32(a[14].val[0], vecadd), 13),
928         vshrn_n_s32(vaddq_s32(a[14].val[1], vecadd), 13));
929     b[15] = vcombine_s16(
930         vshrn_n_s32(vaddq_s32(a[15].val[0], vecadd), 13),
931         vshrn_n_s32(vaddq_s32(a[15].val[1], vecadd), 13));
932 }
933 
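// Sum four 32-bit vectors.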
934 static INLINE int32x4_t
935     add4(int32x4_t row1_low, int32x4_t row1_high, int32x4_t row2_low, int32x4_t row2_high)
936 {
937     int32x4_t sum1, sum2;
938     sum1 = vaddq_s32(row1_low, row1_high);
939     sum2 = vaddq_s32(row2_low, row2_high);
940     return vaddq_s32(sum1, sum2);
941 }
942 
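// Widening butterfly with a single coefficient:
// row1 = c*(a + b), row2 = c*(a - b).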
943 static INLINE void butterfly_one_coeff_16_32(
944     int16x8_t a, int16x8_t b, int16_t c, int32x4x2_t *row1, int32x4x2_t *row2)
945 {
946     const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
947     const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
949     row1->val[0] = vmlal_n_s16(a0, vget_low_s16(b), c);
950     row1->val[1] = vmlal_n_s16(a1, vget_high_s16(b), c);
951     row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c);
952     row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c);
953 }
954 
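// Widening butterfly with two coefficients:
// row1 = c1*a + c0*b, row2 = c0*a - c1*b.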
955 static INLINE void butterfly_two_coeff_16_32(
956     int16x8_t a, int16x8_t b, int16_t c0, int16_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
957 {
958     const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
959     const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
960     const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
961     const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
962     row1->val[0] = vmlal_n_s16(a2, vget_low_s16(b), c0);
963     row1->val[1] = vmlal_n_s16(a3, vget_high_s16(b), c0);
964     row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c1);
965     row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c1);
966 }
967 
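// 32-bit input variant of butterfly_one_coeff_16_32:
// row1 = c*(a + b), row2 = c*(a - b).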
968 static INLINE void butterfly_one_coeff_32_32(
969     int32x4x2_t a, int32x4x2_t b, int32_t c, int32x4x2_t *row1, int32x4x2_t *row2)
970 {
971     const int32x4_t a0 = vmulq_n_s32(a.val[0], c);
972     const int32x4_t a1 = vmulq_n_s32(a.val[1], c);
973     row1->val[0] = vmlaq_n_s32(a0, b.val[0], c);
974     row1->val[1] = vmlaq_n_s32(a1, b.val[1], c);
975     row2->val[0] = vmlsq_n_s32(a0, b.val[0], c);
976     row2->val[1] = vmlsq_n_s32(a1, b.val[1], c);
977 }
978 
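// 32-bit input variant of butterfly_two_coeff_16_32:
// row1 = c1*a + c0*b, row2 = c0*a - c1*b.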
979 static INLINE void butterfly_two_coeff_32_32(
980     int32x4x2_t a, int32x4x2_t b, int32_t c0, int32_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
981 {
982     const int32x4_t a0 = vmulq_n_s32(a.val[0], c0);
983     const int32x4_t a1 = vmulq_n_s32(a.val[1], c0);
984     const int32x4_t a2 = vmulq_n_s32(a.val[0], c1);
985     const int32x4_t a3 = vmulq_n_s32(a.val[1], c1);
986     row1->val[0] = vmlaq_n_s32(a2, b.val[0], c0);
987     row1->val[1] = vmlaq_n_s32(a3, b.val[1], c0);
988     row2->val[0] = vmlsq_n_s32(a0, b.val[0], c1);
989     row2->val[1] = vmlsq_n_s32(a1, b.val[1], c1);
990 }
991 
992 // Transpose 8x8 to a new location. Don't use transpose_neon.h because those
993 // are all in-place.
994 static INLINE void transpose_8x8(int32x4x2_t *a /*[8][2]*/, int32x4x2_t *b)
995 {
996     const int32x4x2_t c0 = vtrnq_s32(a[0].val[0], a[1].val[0]);
997     const int32x4x2_t c1 = vtrnq_s32(a[2].val[0], a[3].val[0]);
998     const int32x4x2_t c2 = vtrnq_s32(a[4].val[0], a[5].val[0]);
999     const int32x4x2_t c3 = vtrnq_s32(a[6].val[0], a[7].val[0]);
1000     const int32x4x2_t c4 = vtrnq_s32(a[0].val[1], a[1].val[1]);
1001     const int32x4x2_t c5 = vtrnq_s32(a[2].val[1], a[3].val[1]);
1002     const int32x4x2_t c6 = vtrnq_s32(a[4].val[1], a[5].val[1]);
1003     const int32x4x2_t c7 = vtrnq_s32(a[6].val[1], a[7].val[1]);
1004 
1005     const int32x4x2_t d0 = vtrnq_s64_to_s32(c0.val[0], c1.val[0]);
1006     const int32x4x2_t d1 = vtrnq_s64_to_s32(c0.val[1], c1.val[1]);
1007     const int32x4x2_t d2 = vtrnq_s64_to_s32(c2.val[0], c3.val[0]);
1008     const int32x4x2_t d3 = vtrnq_s64_to_s32(c2.val[1], c3.val[1]);
1009     const int32x4x2_t d4 = vtrnq_s64_to_s32(c4.val[0], c5.val[0]);
1010     const int32x4x2_t d5 = vtrnq_s64_to_s32(c4.val[1], c5.val[1]);
1011     const int32x4x2_t d6 = vtrnq_s64_to_s32(c6.val[0], c7.val[0]);
1012     const int32x4x2_t d7 = vtrnq_s64_to_s32(c6.val[1], c7.val[1]);
1013 
1014     b[0].val[0] = d0.val[0];
1015     b[0].val[1] = d2.val[0];
1016     b[1].val[0] = d1.val[0];
1017     b[1].val[1] = d3.val[0];
1018     b[2].val[0] = d0.val[1];
1019     b[2].val[1] = d2.val[1];
1020     b[3].val[0] = d1.val[1];
1021     b[3].val[1] = d3.val[1];
1022     b[4].val[0] = d4.val[0];
1023     b[4].val[1] = d6.val[0];
1024     b[5].val[0] = d5.val[0];
1025     b[5].val[1] = d7.val[0];
1026     b[6].val[0] = d4.val[1];
1027     b[6].val[1] = d6.val[1];
1028     b[7].val[0] = d5.val[1];
1029     b[7].val[1] = d7.val[1];
1030 }
1031 
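// One pass of the 16-point forward transform: takes the 16 butterflied
// 16-bit rows from cross_input_16 and writes 16 rows of 32-bit results;
// even-indexed outputs come from in[0..7], odd-indexed outputs from in[8..15].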
1032 static void dct_body_16_32(int16x8_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
1033 {
1034     int16x8_t s[8];
1035     int16x8_t x[4];
1036     int32x4x2_t tmp0, tmp1, tmp2, tmp3;
1037     int32x4x2_t tmp4, tmp5, tmp6, tmp7;
1038 
1039     s[0] = vaddq_s16(in[0], in[7]);
1040     s[1] = vaddq_s16(in[1], in[6]);
1041     s[2] = vaddq_s16(in[2], in[5]);
1042     s[3] = vaddq_s16(in[3], in[4]);
1043     s[4] = vsubq_s16(in[3], in[4]);
1044     s[5] = vsubq_s16(in[2], in[5]);
1045     s[6] = vsubq_s16(in[1], in[6]);
1046     s[7] = vsubq_s16(in[0], in[7]);
1047 
1048     x[0] = vaddq_s16(s[0], s[3]);
1049     x[1] = vaddq_s16(s[1], s[2]);
1050     x[2] = vsubq_s16(s[1], s[2]);
1051     x[3] = vsubq_s16(s[0], s[3]);
1052 
1053     // Type 1
1054     // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
1055     // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
1056     butterfly_one_coeff_16_32(x[0], x[1], 64, &out[0], &out[8]);
1057 
1058     // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
1059     // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
1060     butterfly_two_coeff_16_32(x[3], x[2], 36, 83, &out[4], &out[12]);
1061 
1062     //  Type 2
1063     butterfly_two_coeff_16_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
1064     butterfly_two_coeff_16_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
1065 
1066     out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1067     out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1068 
1069     out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1070     out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1071 
1072     butterfly_two_coeff_16_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
1073     butterfly_two_coeff_16_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
1074 
1075     out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1076     out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1077 
1078     out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1079     out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1080 
1081     //  Type 3
1082     butterfly_two_coeff_16_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
1083     butterfly_two_coeff_16_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
1084     butterfly_two_coeff_16_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
1085     butterfly_two_coeff_16_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
1086 
1087     out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1088     out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1089 
1090     out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1091     out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1092 
1093     butterfly_two_coeff_16_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
1094     butterfly_two_coeff_16_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
1095     butterfly_two_coeff_16_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
1096     butterfly_two_coeff_16_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
1097 
1098     out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1099     out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1100 
1101     out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1102     out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1103 
1104     butterfly_two_coeff_16_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
1105     butterfly_two_coeff_16_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
1106     butterfly_two_coeff_16_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
1107     butterfly_two_coeff_16_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
1108 
1109     out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1110     out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1111 
1112     out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1113     out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1114 
1115     butterfly_two_coeff_16_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
1116     butterfly_two_coeff_16_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
1117     butterfly_two_coeff_16_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
1118     butterfly_two_coeff_16_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
1119 
1120     out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1121     out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1122 
1123     out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1124     out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1125 }
1126 
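// Second-pass counterpart of dct_body_16_32, operating entirely on the
// 32-bit intermediate data.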
1127 static void dct_body_32_32(int32x4x2_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
1128 {
1129     int32x4x2_t s[8];
1130     int32x4x2_t x[4];
1131     int32x4x2_t tmp0, tmp1, tmp2, tmp3;
1132     int32x4x2_t tmp4, tmp5, tmp6, tmp7;
1133     WORD32 i;
1134 
1135     for(i = 0; i < 2; i++)
1136     {
1137         s[0].val[i] = vaddq_s32(in[0].val[i], in[7].val[i]);
1138         s[1].val[i] = vaddq_s32(in[1].val[i], in[6].val[i]);
1139         s[2].val[i] = vaddq_s32(in[2].val[i], in[5].val[i]);
1140         s[3].val[i] = vaddq_s32(in[3].val[i], in[4].val[i]);
1141         s[4].val[i] = vsubq_s32(in[3].val[i], in[4].val[i]);
1142         s[5].val[i] = vsubq_s32(in[2].val[i], in[5].val[i]);
1143         s[6].val[i] = vsubq_s32(in[1].val[i], in[6].val[i]);
1144         s[7].val[i] = vsubq_s32(in[0].val[i], in[7].val[i]);
1145 
1146         x[0].val[i] = vaddq_s32(s[0].val[i], s[3].val[i]);
1147         x[1].val[i] = vaddq_s32(s[1].val[i], s[2].val[i]);
1148         x[2].val[i] = vsubq_s32(s[1].val[i], s[2].val[i]);
1149         x[3].val[i] = vsubq_s32(s[0].val[i], s[3].val[i]);
1150     }
1151 
1152     // Type 1
1153     // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
1154     // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
1155     butterfly_one_coeff_32_32(x[0], x[1], 64, &out[0], &out[8]);
1156     // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
1157     // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
1158     butterfly_two_coeff_32_32(x[3], x[2], 36, 83, &out[4], &out[12]);
1159 
1160     //  Type 2
1161     butterfly_two_coeff_32_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
1162     butterfly_two_coeff_32_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
1163 
1164     out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1165     out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1166 
1167     out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1168     out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1169 
1170     butterfly_two_coeff_32_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
1171     butterfly_two_coeff_32_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
1172 
1173     out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1174     out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1175 
1176     out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1177     out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1178 
1179     //  Type 3
1180     butterfly_two_coeff_32_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
1181     butterfly_two_coeff_32_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
1182     butterfly_two_coeff_32_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
1183     butterfly_two_coeff_32_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
1184 
1185     out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1186     out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1187 
1188     out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1189     out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1190 
1191     butterfly_two_coeff_32_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
1192     butterfly_two_coeff_32_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
1193     butterfly_two_coeff_32_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
1194     butterfly_two_coeff_32_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
1195 
1196     out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1197     out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1198 
1199     out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1200     out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1201 
1202     butterfly_two_coeff_32_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
1203     butterfly_two_coeff_32_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
1204     butterfly_two_coeff_32_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
1205     butterfly_two_coeff_32_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
1206 
1207     out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1208     out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1209 
1210     out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1211     out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1212 
1213     butterfly_two_coeff_32_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
1214     butterfly_two_coeff_32_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
1215     butterfly_two_coeff_32_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
1216     butterfly_two_coeff_32_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
1217 
1218     out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1219     out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1220 
1221     out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1222     out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1223 }
1224 
1225 /**
1226  *******************************************************************************
1227  *
1228  * @brief
1229  *  This function performs residue calculation and forward  transform on
1230  * input pixels
1231  *
1232  * @par Description:
1233  *  Performs residue calculation by subtracting prediction from source,
1234  * followed by forward transform
1235  *
1236  * @param[in] pu1_src
1237  *  Input 16x16 pixels
1238  *
1239  * @param[in] pu1_pred
1240  *  Prediction data
1241  *
1242  * @param[in] pi2_tmp
1243  *  Temporary buffer of size 16x16
1244  *
1245  * @param[out] pi2_dst
1246  *  Output 16x16 coefficients
1247  *
1248  * @param[in] src_strd
1249  *  Input stride
1250  *
1251  * @param[in] pred_strd
1252  *  Prediction Stride
1253  *
1254  * @param[in] dst_strd
1255  *  Output Stride
1256  *
1257  * @param[in] e_chroma_plane
1258  *  Enum signalling chroma plane
1259  *
1260  * @returns  block sad
1261  *
1262  * @remarks
1263  *  None
1264  *
1265  *******************************************************************************
1266  */
1267 UWORD32 ihevc_resi_trans_16x16_neon(
1268     UWORD8 *pu1_src,
1269     UWORD8 *pu1_pred,
1270     WORD32 *pi4_temp,
1271     WORD16 *pi2_dst,
1272     WORD32 src_strd,
1273     WORD32 pred_strd,
1274     WORD32 dst_strd,
1275     CHROMA_PLANE_ID_T e_chroma_plane)
1276 {
1277     UWORD32 u4_blk_sad = 0;
1278     WORD32 chroma_flag;
1279     uint8x8_t temp0[16], temp1[16];
1280     int16x8_t temp2[16], temp3[16];
1281     int32x4_t tmp_a, tmp_b;
1282     int64x2_t tmp_c;
1283     int32x2_t sad_v;
1284     int32x4x2_t out0[16], out1[16], temp4[16], temp5[16];
1285 
1286     (void)pi4_temp;
1287     chroma_flag = e_chroma_plane != NULL_PLANE;
1288     /* Residue + Forward Transform 1st stage */
1289     // Left half.
1290     load(pu1_src, src_strd, temp0, e_chroma_plane);
1291     load(pu1_pred, pred_strd, temp1, e_chroma_plane);
1292 
1293     tmp_a = diff(temp0, temp1, temp2);
1294     cross_input_16(temp2, temp3);
1295     dct_body_16_32(temp3, out0);
1296 
1297     // Right half.
1298     load(pu1_src + 8 * (1 + chroma_flag), src_strd, temp0, e_chroma_plane);
1299     load(pu1_pred + 8 * (1 + chroma_flag), pred_strd, temp1, e_chroma_plane);
1300 
1301     tmp_b = diff(temp0, temp1, temp2);
1302     cross_input_16(temp2, temp3);
1303     dct_body_16_32(temp3, out1);
1304 
1305     tmp_a = vaddq_s32(tmp_a, tmp_b);
1306     tmp_c = vpaddlq_s32(tmp_a);
1307     sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_c)),
1308                    vreinterpret_s32_s64(vget_high_s64(tmp_c)));
1309     u4_blk_sad = vget_lane_s32(sad_v, 0);
1310 
1311 
1312     // Transpose the top-left and top-right quarters into one contiguous
1313     // location to process the top half.
1314     transpose_8x8(&out0[0], &temp4[0]);
1315     transpose_8x8(&out1[0], &temp4[8]);
1316 
1317     cross_input_32(temp4, temp5);
1318     dct_body_32_32(temp5, temp4);
1319     partial_round_shift(temp4, temp2);
1320     transpose_s16_8x8(
1321         &temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
1322     transpose_s16_8x8(
1323         &temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
1324 
1325     store(pi2_dst, &temp2[0]);
1326     store(pi2_dst + 8, &temp2[8]);
1327     pi2_dst += 8 * dst_strd;
1328 
1329     // Transpose bottom left and bottom right quarters into one contiguous
1330     // location to process the bottom half.
1331     transpose_8x8(&out0[8], &out1[0]);
1332     transpose_s32_8x8(
1333         &out1[8], &out1[9], &out1[10], &out1[11], &out1[12], &out1[13], &out1[14], &out1[15]);
1334 
1335     cross_input_32(out1, temp5);
1336     dct_body_32_32(temp5, temp4);
1337     partial_round_shift(temp4, temp2);
1338     transpose_s16_8x8(
1339         &temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
1340     transpose_s16_8x8(
1341         &temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
1342     store(pi2_dst, &temp2[0]);
1343     store(pi2_dst + 8, &temp2[8]);
1344 
1345     return u4_blk_sad;
1346 }
1347