/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_mem_fns_ssse3.c
 *
 * @brief
 *  Functions used for memory operations
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - isvc_copy_2d_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "ih264_typedefs.h"
#include "isvc_mem_fns.h"

#include <immintrin.h>

/**
********************************************************************************
*  @brief  Copies a 2-D block of bytes from one location to another
*
*  @param[out] pu1_dst : destination pointer
*
*  @param[in] i4_dst_stride : stride of the destination buffer
*
*  @param[in] pu1_src : source pointer
*
*  @param[in] i4_src_stride : stride of the source buffer
*
*  @param[in] i4_blk_wd : block width
*
*  @param[in] i4_blk_ht : block height
*
*  @return void
********************************************************************************
*/
void isvc_copy_2d_ssse3(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 *pu1_src,
                        WORD32 i4_src_stride, WORD32 i4_blk_wd, WORD32 i4_blk_ht)
{
    WORD32 i, j;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

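    /* Block dimensions that are not multiples of 4 fall back to the generic */
    /* isvc_copy_2d() routine                                                */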
    if(((i4_blk_wd % 4) != 0) || ((i4_blk_ht % 4) != 0))
    {
        isvc_copy_2d(pu1_dst, i4_dst_stride, pu1_src, i4_src_stride, i4_blk_wd, i4_blk_ht);

        return;
    }

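    /* The copy is dispatched to the widest path the block width allows      */
    /* (32-, 16-, 8- or 4-byte columns per inner iteration); each path       */
    /* processes rows in tiles of 8 or 4 depending on the block height       */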
    if(0 == (i4_blk_wd & 31)) /* wd multiple of 32 case */
    {
        __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
        __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;

        if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
        {
            __m128i src8_16x8b, src9_16x8b, src10_16x8b, src11_16x8b;
            __m128i src12_16x8b, src13_16x8b, src14_16x8b, src15_16x8b;

            for(i = 0; i < i4_blk_ht; i += 8)
            {
                for(j = 0; j < i4_blk_wd; j += 32)
                {
                    src0_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src));  // i = 0
                    src1_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3
                    src4_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride));  // i = 4
                    src5_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride));  // i = 5
                    src6_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride));  // i = 6
                    src7_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride));  // i = 7
                    /* Add 16 as offset */
                    src8_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 16));  // i = 0
                    src9_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride + 16));  // i = 1
                    src10_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride + 16));  // i = 2
                    src11_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride + 16));  // i = 3
                    src12_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride + 16));  // i = 4
                    src13_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride + 16));  // i = 5
                    src14_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride + 16));  // i = 6
                    src15_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride + 16));  // i = 7

                    _mm_storeu_si128((__m128i *) (pu1_dst), src0_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride), src1_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);

                    _mm_storeu_si128((__m128i *) (pu1_dst + 16), src8_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride + 16), src9_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride + 16), src10_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride + 16), src11_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride + 16), src12_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride + 16), src13_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride + 16), src14_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride + 16), src15_16x8b);

                    pu1_src += 32;
                    pu1_dst += 32;
                }

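                /* Rewind the width advance from the inner loop and step down 8 rows */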
                pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
            }
        }
        else /* ht multiple of 4 case */
        {
            for(i = 0; i < i4_blk_ht; i += 4)
            {
                for(j = 0; j < i4_blk_wd; j += 32)
                {
                    src0_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src));  // i = 0
                    src1_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3
                    /* Add 16 as offset */
                    src4_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 16));  // i = 0
                    src5_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride + 16));  // i = 1
                    src6_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride + 16));  // i = 2
                    src7_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride + 16));  // i = 3

                    _mm_storeu_si128((__m128i *) (pu1_dst), src0_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride), src1_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 16), src4_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride + 16), src5_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride + 16), src6_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride + 16), src7_16x8b);

                    pu1_src += 32;
                    pu1_dst += 32;
                }

                pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
            }
        }
    }
    else if(0 == (i4_blk_wd & 15)) /* wd multiple of 16 case */
    {
        __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

        if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
        {
            __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;

            for(i = 0; i < i4_blk_ht; i += 8)
            {
                for(j = 0; j < i4_blk_wd; j += 16)
                {
                    src0_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3
                    src4_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride));  // i = 4
                    src5_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride));  // i = 5
                    src6_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride));  // i = 6
                    src7_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride));  // i = 7

                    _mm_storeu_si128((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);

                    pu1_src += 16;
                    pu1_dst += 16;
                }

                pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
            }
        }
        else /* ht multiple of 4 case */
        {
            for(i = 0; i < i4_blk_ht; i += 4)
            {
                for(j = 0; j < i4_blk_wd; j += 16)
                {
                    src0_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3

                    _mm_storeu_si128((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);

                    pu1_src += 16;
                    pu1_dst += 16;
                }

                pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
            }
        }
    }
    else if(0 == (i4_blk_wd & 7)) /* wd multiple of 8 case */
    {
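        /* 8-byte columns: use 64-bit loads and stores through the low half */
        /* of an XMM register                                                */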
        __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

        if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
        {
            __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;

            for(i = 0; i < i4_blk_ht; i += 8)
            {
                for(j = 0; j < i4_blk_wd; j += 8)
                {
                    src0_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3
                    src4_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 4 * i4_src_stride));  // i = 4
                    src5_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 5 * i4_src_stride));  // i = 5
                    src6_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 6 * i4_src_stride));  // i = 6
                    src7_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 7 * i4_src_stride));  // i = 7

                    _mm_storel_epi64((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);

                    pu1_src += 8;
                    pu1_dst += 8;
                }

                pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
            }
        }
        else /* ht multiple of 4 case */
        {
            for(i = 0; i < i4_blk_ht; i += 4)
            {
                for(j = 0; j < i4_blk_wd; j += 8)
                {
                    src0_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3

                    _mm_storel_epi64((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
                    _mm_storel_epi64((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);

                    pu1_src += 8;
                    pu1_dst += 8;
                }

                pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
            }
        }
    }
    else /* wd multiple of 4 case */
    {
        __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
        WORD32 src0, src1, src2, src3;
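        /* Each row is fetched with a 64-bit load, but only the low 4 bytes     */
        /* (extracted with _mm_cvtsi128_si32) are written back as a 32-bit word */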
        if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
        {
            __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;
            WORD32 src4, src5, src6, src7;

            for(i = 0; i < i4_blk_ht; i += 8)
            {
                for(j = 0; j < i4_blk_wd; j += 4)
                {
                    src0_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3
                    src4_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 4 * i4_src_stride));  // i = 4
                    src5_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 5 * i4_src_stride));  // i = 5
                    src6_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 6 * i4_src_stride));  // i = 6
                    src7_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 7 * i4_src_stride));  // i = 7

                    src0 = _mm_cvtsi128_si32(src0_16x8b);
                    src1 = _mm_cvtsi128_si32(src1_16x8b);
                    src2 = _mm_cvtsi128_si32(src2_16x8b);
                    src3 = _mm_cvtsi128_si32(src3_16x8b);
                    src4 = _mm_cvtsi128_si32(src4_16x8b);
                    src5 = _mm_cvtsi128_si32(src5_16x8b);
                    src6 = _mm_cvtsi128_si32(src6_16x8b);
                    src7 = _mm_cvtsi128_si32(src7_16x8b);

                    *(WORD32 *) (&pu1_dst[0 * i4_dst_stride]) = src0;
                    *(WORD32 *) (&pu1_dst[1 * i4_dst_stride]) = src1;
                    *(WORD32 *) (&pu1_dst[2 * i4_dst_stride]) = src2;
                    *(WORD32 *) (&pu1_dst[3 * i4_dst_stride]) = src3;
                    *(WORD32 *) (&pu1_dst[4 * i4_dst_stride]) = src4;
                    *(WORD32 *) (&pu1_dst[5 * i4_dst_stride]) = src5;
                    *(WORD32 *) (&pu1_dst[6 * i4_dst_stride]) = src6;
                    *(WORD32 *) (&pu1_dst[7 * i4_dst_stride]) = src7;

                    pu1_src += 4;
                    pu1_dst += 4;
                }

                pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
            }
        }
        else /* ht multiple of 4 case */
        {
            for(i = 0; i < i4_blk_ht; i += 4)
            {
                for(j = 0; j < i4_blk_wd; j += 4)
                {
                    src0_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride));  // i = 0
                    src1_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride));  // i = 1
                    src2_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride));  // i = 2
                    src3_16x8b =
                        _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride));  // i = 3

                    src0 = _mm_cvtsi128_si32(src0_16x8b);
                    src1 = _mm_cvtsi128_si32(src1_16x8b);
                    src2 = _mm_cvtsi128_si32(src2_16x8b);
                    src3 = _mm_cvtsi128_si32(src3_16x8b);

                    *(WORD32 *) (&pu1_dst[0 * i4_dst_stride]) = src0;
                    *(WORD32 *) (&pu1_dst[1 * i4_dst_stride]) = src1;
                    *(WORD32 *) (&pu1_dst[2 * i4_dst_stride]) = src2;
                    *(WORD32 *) (&pu1_dst[3 * i4_dst_stride]) = src3;

                    pu1_src += 4;
                    pu1_dst += 4;
                }

                pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
                pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
            }
        }
    }
}
443