xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_copy_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar  *
3*c83a76b0SSuyog Pawar  * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar  *
5*c83a76b0SSuyog Pawar  * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar  * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar  * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar  *
9*c83a76b0SSuyog Pawar  * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar  *
11*c83a76b0SSuyog Pawar  * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar  * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar  * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar  * limitations under the License.
16*c83a76b0SSuyog Pawar  *
17*c83a76b0SSuyog Pawar  *****************************************************************************
18*c83a76b0SSuyog Pawar  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar /**
21*c83a76b0SSuyog Pawar *******************************************************************************
22*c83a76b0SSuyog Pawar * @file
23*c83a76b0SSuyog Pawar *  ihevce_copy_neon.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar *  Contains intrinsic definitions of functions for block copy
27*c83a76b0SSuyog Pawar *
28*c83a76b0SSuyog Pawar * @author
29*c83a76b0SSuyog Pawar *  ittiam
30*c83a76b0SSuyog Pawar *
31*c83a76b0SSuyog Pawar * @par List of Functions:
32*c83a76b0SSuyog Pawar *  - ihevce_2d_square_copy_luma_neon()
33*c83a76b0SSuyog Pawar *  - ihevce_copy_2d_neon()
34*c83a76b0SSuyog Pawar *  - ihevce_chroma_interleave_2d_copy_neon()
35*c83a76b0SSuyog Pawar *
36*c83a76b0SSuyog Pawar * @remarks
37*c83a76b0SSuyog Pawar *  None
38*c83a76b0SSuyog Pawar *
39*c83a76b0SSuyog Pawar *******************************************************************************
40*c83a76b0SSuyog Pawar */
41*c83a76b0SSuyog Pawar 
42*c83a76b0SSuyog Pawar /*****************************************************************************/
43*c83a76b0SSuyog Pawar /* File Includes                                                             */
44*c83a76b0SSuyog Pawar /*****************************************************************************/
45*c83a76b0SSuyog Pawar /* System include files */
46*c83a76b0SSuyog Pawar #include <string.h>
47*c83a76b0SSuyog Pawar #include <assert.h>
48*c83a76b0SSuyog Pawar #include <arm_neon.h>
49*c83a76b0SSuyog Pawar 
50*c83a76b0SSuyog Pawar /* User include files */
51*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
52*c83a76b0SSuyog Pawar #include "itt_video_api.h"
53*c83a76b0SSuyog Pawar #include "ihevc_platform_macros.h"
54*c83a76b0SSuyog Pawar 
55*c83a76b0SSuyog Pawar #include "ihevce_cmn_utils_instr_set_router.h"
56*c83a76b0SSuyog Pawar 
57*c83a76b0SSuyog Pawar /*****************************************************************************/
58*c83a76b0SSuyog Pawar /* Function Definitions                                                      */
59*c83a76b0SSuyog Pawar /*****************************************************************************/
60*c83a76b0SSuyog Pawar 
/**
 * Copies one chroma plane of a square, UV-interleaved block from source to
 * destination, leaving the bytes of the other plane in the destination as
 * they were.
 *
 * Each 16-bit lane holds one interleaved byte pair. A per-lane mask of
 * 0xff << (e_chroma_plane << 3) picks which byte of every pair is taken from
 * the source; the remaining byte is re-written with the value just loaded
 * from the destination.
 *
 * @param pu1_uv_src      first byte of the interleaved source block
 * @param src_strd        byte step between consecutive source rows
 * @param pu1_uv_dst      first byte of the interleaved destination block
 * @param dst_strd        byte step between consecutive destination rows
 * @param w               block width in byte pairs; also used as the row
 *                        count. Must be 4 or a multiple of 8 (asserted)
 * @param h               only checked against w by an assert
 * @param e_chroma_plane  U_PLANE or V_PLANE (asserted)
 */
void ihevce_chroma_interleave_2d_copy_neon(
    UWORD8 *pu1_uv_src,
    WORD32 src_strd,
    UWORD8 *pu1_uv_dst,
    WORD32 dst_strd,
    WORD32 w,
    WORD32 h,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    WORD32 row;

    /* h is consumed only by the assert below; keep NDEBUG builds warning-free */
    (void)h;
    assert(w == h);
    assert((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));

    if(w != 4)
    {
        /* NOTE(review): presumably U_PLANE == 0 keeps the low byte of each
         * pair and V_PLANE == 1 the high byte - confirm against the enum */
        const uint8x16_t plane_mask =
            vreinterpretq_u8_u16(vdupq_n_u16(0xff << (e_chroma_plane << 3)));
        WORD32 col;

        assert(w % 8 == 0);
        for(row = 0; row < w; row++)
        {
            UWORD8 *src_row = pu1_uv_src + row * src_strd;
            UWORD8 *dst_row = pu1_uv_dst + row * dst_strd;

            /* 8 byte pairs (16 bytes) are blended per iteration */
            for(col = 0; col < w; col += 8)
            {
                UWORD8 *src_seg = src_row + (col * 2);
                UWORD8 *dst_seg = dst_row + (col * 2);
                uint8x16_t from_src = vld1q_u8(src_seg);
                uint8x16_t from_dst = vld1q_u8(dst_seg);

                vst1q_u8(dst_seg, vbslq_u8(plane_mask, from_src, from_dst));
            }
        }
        return;
    }

    /* 4x4 block: a row is 4 byte pairs, i.e. exactly one 8-byte vector */
    {
        const uint8x8_t plane_mask =
            vreinterpret_u8_u16(vdup_n_u16(0xff << (e_chroma_plane << 3)));

        for(row = 0; row < w; row++)
        {
            uint8x8_t from_src = vld1_u8(pu1_uv_src);
            uint8x8_t from_dst = vld1_u8(pu1_uv_dst);

            vst1_u8(pu1_uv_dst, vbsl_u8(plane_mask, from_src, from_dst));
            pu1_uv_src += src_strd;
            pu1_uv_dst += dst_strd;
        }
    }
}
117*c83a76b0SSuyog Pawar 
/**
 * Copies a blk_wd x blk_ht block of bytes, one row at a time.
 *
 * Only widths of 4, 8, 16, 32 or a multiple of 64 are supported (asserted);
 * any other width copies nothing when asserts are compiled out. Every row
 * is copied with loads/stores matching the width: memcpy for 4 bytes,
 * 64-bit vectors for 8 and 128-bit vectors for the wider cases.
 *
 * @param pu1_dst   first byte of the destination block
 * @param dst_strd  byte step between consecutive destination rows
 * @param pu1_src   first byte of the source block
 * @param src_strd  byte step between consecutive source rows
 * @param blk_wd    row width in bytes
 * @param blk_ht    number of rows; non-positive values copy nothing
 */
static void copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    WORD32 row, col, off;

    assert(blk_wd == 4 || blk_wd == 8 || blk_wd == 16 || blk_wd == 32 || (blk_wd % 64 == 0));

    switch(blk_wd)
    {
    case 4:
        for(row = 0; row < blk_ht; row++)
        {
            memcpy(pu1_dst, pu1_src, 4);
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 8:
        for(row = 0; row < blk_ht; row++)
        {
            vst1_u8(pu1_dst, vld1_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 16:
        for(row = 0; row < blk_ht; row++)
        {
            vst1q_u8(pu1_dst, vld1q_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 32:
        for(row = 0; row < blk_ht; row++)
        {
            /* two 16-byte halves, low half first */
            vst1q_u8(pu1_dst, vld1q_u8(pu1_src));
            vst1q_u8(pu1_dst + 16, vld1q_u8(pu1_src + 16));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    default:
        /* anything that is not a multiple of 64 is unsupported: do nothing */
        if(blk_wd % 64 != 0)
            break;

        for(row = 0; row < blk_ht; row++)
        {
            UWORD8 *dst_row = pu1_dst + row * dst_strd;
            UWORD8 *src_row = pu1_src + row * src_strd;

            /* walk the row in 64-byte segments, four 16-byte vectors each */
            for(col = 0; col < blk_wd; col += 64)
            {
                for(off = 0; off < 64; off += 16)
                {
                    vst1q_u8(dst_row + col + off, vld1q_u8(src_row + col + off));
                }
            }
        }
        break;
    }
}
196*c83a76b0SSuyog Pawar 
/**
 * Copies a square block of num_cols_to_copy x num_cols_to_copy elements,
 * each unit_size bytes wide, by forwarding to the byte-wise copy_2d_neon().
 *
 * @param p_dst             destination block
 * @param dst_strd          destination row step, in elements
 * @param p_src             source block
 * @param src_strd          source row step, in elements
 * @param num_cols_to_copy  elements per row; also the number of rows copied
 * @param unit_size         multiplier converting element counts to bytes
 */
void ihevce_2d_square_copy_luma_neon(
    void *p_dst,
    WORD32 dst_strd,
    void *p_src,
    WORD32 src_strd,
    WORD32 num_cols_to_copy,
    WORD32 unit_size)
{
    /* copy_2d_neon() works on bytes, so scale strides and width up front */
    const WORD32 dst_strd_bytes = dst_strd * unit_size;
    const WORD32 src_strd_bytes = src_strd * unit_size;
    const WORD32 row_bytes = num_cols_to_copy * unit_size;

    /* the row count is not scaled: the block is square in elements */
    copy_2d_neon(
        (UWORD8 *)p_dst,
        dst_strd_bytes,
        (UWORD8 *)p_src,
        src_strd_bytes,
        row_bytes,
        num_cols_to_copy);
}
216*c83a76b0SSuyog Pawar 
/**
 * Copies a blk_wd x blk_ht block of bytes of arbitrary width.
 *
 * The width is consumed left to right in the largest chunk copy_2d_neon()
 * supports that still fits (64, 32, 16, 8 or 4 bytes). Whatever is left
 * (1 to 3 columns, or a negative width passed through untouched) is handed
 * to ihevce_copy_2d(), which is declared outside this file.
 * NOTE(review): ihevce_copy_2d() is presumably the generic C copy with the
 * same argument meaning - confirm.
 *
 * copy_2d_neon() walks rows one at a time, so odd heights need no separate
 * handling and a non-positive blk_ht makes the NEON chunks copy nothing.
 *
 * @param pu1_dst   first byte of the destination block
 * @param dst_strd  byte step between consecutive destination rows
 * @param pu1_src   first byte of the source block
 * @param src_strd  byte step between consecutive source rows
 * @param blk_wd    row width in bytes; 0 returns without copying
 * @param blk_ht    number of rows
 */
void ihevce_copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    while(blk_wd >= 4)
    {
        WORD32 chunk_wd;

        /* '>=' so that widths of exactly 64 and 32 use their own fast path */
        if(blk_wd >= 64)
            chunk_wd = 64;
        else if(blk_wd >= 32)
            chunk_wd = 32;
        else if(blk_wd >= 16)
            chunk_wd = 16;
        else if(blk_wd >= 8)
            chunk_wd = 8;
        else
            chunk_wd = 4;

        copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, chunk_wd, blk_ht);

        /* step right past the columns just copied */
        pu1_dst += chunk_wd;
        pu1_src += chunk_wd;
        blk_wd -= chunk_wd;
    }

    /* leftover columns narrower than the smallest NEON chunk */
    if(blk_wd != 0)
    {
        ihevce_copy_2d(pu1_dst, dst_strd, pu1_src, src_strd, blk_wd, blk_ht);
    }
}
284