xref: /aosp_15_r20/external/libmpeg2/common/armv8/impeg2_inter_pred.s (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li//******************************************************************************
2*a97c2a1fSXin Li//*
3*a97c2a1fSXin Li//* Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li//*
5*a97c2a1fSXin Li//* Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li//* you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li//* You may obtain a copy of the License at:
8*a97c2a1fSXin Li//*
9*a97c2a1fSXin Li//* http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li//*
11*a97c2a1fSXin Li//* Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li//* distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li//* See the License for the specific language governing permissions and
15*a97c2a1fSXin Li//* limitations under the License.
16*a97c2a1fSXin Li//*
17*a97c2a1fSXin Li//*****************************************************************************
18*a97c2a1fSXin Li//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li//*/
20*a97c2a1fSXin Li
21*a97c2a1fSXin Li///*
22*a97c2a1fSXin Li////----------------------------------------------------------------------------
23*a97c2a1fSXin Li//// File Name            : impeg2_inter_pred.s
24*a97c2a1fSXin Li////
25*a97c2a1fSXin Li//// Description          : This file has motion compensation related
26*a97c2a1fSXin Li////                        interpolation functions on Neon + CortexA-8 platform
27*a97c2a1fSXin Li////
28*a97c2a1fSXin Li//// Reference Document   :
29*a97c2a1fSXin Li////
30*a97c2a1fSXin Li//// Revision History     :
31*a97c2a1fSXin Li////      Date            Author                  Detail Description
32*a97c2a1fSXin Li////   ------------    ----------------    ----------------------------------
33*a97c2a1fSXin Li////   18 jun 2010      S Hamsalekha              Created
34*a97c2a1fSXin Li////
35*a97c2a1fSXin Li////-------------------------------------------------------------------------
36*a97c2a1fSXin Li//*/
37*a97c2a1fSXin Li
38*a97c2a1fSXin Li///*
39*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
40*a97c2a1fSXin Li//// Include Files
41*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
42*a97c2a1fSXin Li//*/
43*a97c2a1fSXin Li//              PRESERVE8
44*a97c2a1fSXin Li.text
45*a97c2a1fSXin Li.include "impeg2_neon_macros.s"
46*a97c2a1fSXin Li
47*a97c2a1fSXin Li///*
48*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
49*a97c2a1fSXin Li//// Struct/Union Types and Define
50*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
51*a97c2a1fSXin Li//*/
52*a97c2a1fSXin Li
53*a97c2a1fSXin Li
54*a97c2a1fSXin Li///*
55*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
56*a97c2a1fSXin Li//// Static Global Data section variables
57*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
58*a97c2a1fSXin Li//*/
59*a97c2a1fSXin Li//// -------------------------- NONE --------------------------------------------
60*a97c2a1fSXin Li
61*a97c2a1fSXin Li
62*a97c2a1fSXin Li///*
63*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
64*a97c2a1fSXin Li//// Static Prototype Functions
65*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
66*a97c2a1fSXin Li//*/
67*a97c2a1fSXin Li//// -------------------------- NONE --------------------------------------------
68*a97c2a1fSXin Li
69*a97c2a1fSXin Li///*
70*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
71*a97c2a1fSXin Li//// Exported functions
72*a97c2a1fSXin Li//// ----------------------------------------------------------------------------
73*a97c2a1fSXin Li//*/
74*a97c2a1fSXin Li
75*a97c2a1fSXin Li
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock from src to dst, plane by
////                      plane: 16 rows of 16 bytes for y, then 8 rows of
////                      8 bytes for u and 8 rows of 8 bytes for v.
////
//// Inputs             : x0 - src : address of three consecutive 8-byte
////                                 plane pointers (y at +0, u at +8,
////                                 v at +16)
////                      x1 - dst : same layout as src
////                      x2 - source row stride in bytes for the y plane
////                           (halved for u and v)
////                      x3 - destination row stride in bytes for the
////                           y plane (halved for u and v)
////
//// Registers Used     : x2-x5 (all modified), v0, v1
////
//// Stack Usage        : set by push_v_regs (macro defined outside this
////                      file); 64 bytes assumed - TODO confirm
////
//// Outputs            : The rows addressed through dst are overwritten
////
//// Return Data        : None
////
//// Programming Note   : x2 and x3 are consumed as full 64-bit post-index
////                      offsets. NOTE(review): if the C prototype passes
////                      them as 32-bit values, the upper register halves
////                      are not guaranteed to be zero - confirm against
////                      the callers.
////-----------------------------------------------------------------------------
//*/

.global impeg2_copy_mb_av8

impeg2_copy_mb_av8:

    push_v_regs                         // macro from impeg2_neon_macros.s

    // ---- y plane: 16 rows, 16 bytes each --------------------------------
    ldr             x4, [x0]            // x4 = src->y
    ldr             x5, [x1]            // x5 = dst->y
.rept 16
    ld1             {v0.8b, v1.8b}, [x4], x2 // fetch one row, step src by stride
    st1             {v0.8b, v1.8b}, [x5], x3 // write it out, step dst by stride
.endr

    // Chroma rows use half of the luma stride.
    lsr             x2, x2, #1
    lsr             x3, x3, #1

    // ---- u plane: 8 rows, 8 bytes each ----------------------------------
    ldr             x4, [x0, #8]        // x4 = src->u
    ldr             x5, [x1, #8]        // x5 = dst->u
.rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
.endr

    // ---- v plane: 8 rows, 8 bytes each ----------------------------------
    ldr             x4, [x0, #16]       // x4 = src->v
    ldr             x5, [x1, #16]       // x5 = dst->v
.rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
.endr

    pop_v_regs                          // undo push_v_regs
    ret
200*a97c2a1fSXin Li
201*a97c2a1fSXin Li
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : Builds an 8x8 block in which every output row is
////                      the rounded average, (a + b + 1) >> 1, of two
////                      vertically adjacent reference rows. Nine
////                      reference rows of 8 bytes are read. Per the
////                      function name this is the full-pel-x / half-pel-y
////                      motion compensation case.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference row stride in bytes
////                      x3 - out_wid : Current block row stride in bytes
////
//// Registers Used     : x0-x3 (modified), x14, v0-v9
////
//// Stack Usage        : set by push_v_regs (macro defined outside this
////                      file); 64 bytes assumed - TODO confirm
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : v8 and v9 are overwritten; push_v_regs and
////                      pop_v_regs are presumed to preserve them -
////                      confirm in impeg2_neon_macros.s
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

    push_v_regs

    // Two read pointers, each stepping two rows at a time:
    // x1 walks reference rows 1,3,5,7,9 and x14 walks rows 2,4,6,8.
    add             x14, x1, x2
    lsl             x2, x2, #1

    // ---- load the 9 reference rows --------------------------------------
    ld1             {v0.8b}, [x1], x2   // ref row 1
    ld1             {v2.8b}, [x14], x2  // ref row 2
    ld1             {v4.8b}, [x1], x2   // ref row 3
    ld1             {v6.8b}, [x14], x2  // ref row 4
    ld1             {v1.8b}, [x1], x2   // ref row 5
    ld1             {v3.8b}, [x14], x2  // ref row 6
    ld1             {v5.8b}, [x1], x2   // ref row 7
    ld1             {v7.8b}, [x14], x2  // ref row 8
    ld1             {v8.8b}, [x1], x2   // ref row 9

    // ---- rounded vertical averages --------------------------------------
    // Results replace their first operand, so each instruction must run
    // before a later one overwrites a register it still needs. Only the
    // low 8 bytes of every result are stored below.
    urhadd          v9.8b, v1.8b, v6.8b     // out row 4 = avg(ref 5, ref 4)
    urhadd          v0.16b, v0.16b, v2.16b  // out row 1 = avg(ref 1, ref 2)
    urhadd          v1.16b, v1.16b, v3.16b  // out row 5 = avg(ref 5, ref 6)
    urhadd          v2.16b, v2.16b, v4.16b  // out row 2 = avg(ref 2, ref 3)
    urhadd          v3.16b, v3.16b, v5.16b  // out row 6 = avg(ref 6, ref 7)
    urhadd          v4.16b, v4.16b, v6.16b  // out row 3 = avg(ref 3, ref 4)
    urhadd          v5.16b, v5.16b, v7.16b  // out row 7 = avg(ref 7, ref 8)
    urhadd          v7.8b, v7.8b, v8.8b     // out row 8 = avg(ref 8, ref 9)

    // Two write pointers, again two rows per step:
    // x0 covers output rows 1,3,5,7 and x14 covers rows 2,4,6,8.
    add             x14, x0, x3
    lsl             x3, x3, #1

    // ---- store the 8 output rows ----------------------------------------
    st1             {v2.8b}, [x14], x3  // out row 2
    st1             {v0.8b}, [x0], x3   // out row 1
    st1             {v9.8b}, [x14], x3  // out row 4
    st1             {v4.8b}, [x0], x3   // out row 3
    st1             {v3.8b}, [x14], x3  // out row 6
    st1             {v1.8b}, [x0], x3   // out row 5
    st1             {v7.8b}, [x14], x3  // out row 8
    st1             {v5.8b}, [x0], x3   // out row 7

    pop_v_regs
    ret
273*a97c2a1fSXin Li
274*a97c2a1fSXin Li
275*a97c2a1fSXin Li
276*a97c2a1fSXin Li
277*a97c2a1fSXin Li
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : Builds an 8x8 block in which every output pixel
////                      is the rounded average, (a + b + 1) >> 1, of a
////                      reference pixel and its right-hand neighbour.
////                      Eight reference rows are read, 16 bytes per row,
////                      of which the first 9 bytes contribute. Per the
////                      function name this is the half-pel-x / full-pel-y
////                      motion compensation case.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference row stride in bytes
////                      x3 - out_wid : Current block row stride in bytes
////
//// Registers Used     : x0, x1 (modified), x12, x14, v0-v10, v12-v14,
////                      v16-v18, v20-v22
////
//// Stack Usage        : set by push_v_regs (macro defined outside this
////                      file); 64 bytes assumed - TODO confirm
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : v8-v10 and v12-v14 are overwritten; push_v_regs
////                      and pop_v_regs are presumed to preserve them -
////                      confirm in impeg2_neon_macros.s
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:

    push_v_regs

    // x1 reads reference rows 1-4, x14 reads rows 5-8.
    add             x14, x1, x2, lsl #2

    // ---- load 16 bytes of every reference row ---------------------------
    ld1             {v0.8b, v1.8b}, [x1], x2    // row 1
    ld1             {v2.8b, v3.8b}, [x14], x2   // row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // row 6
    ld1             {v9.8b, v10.8b}, [x1], x2   // row 3
    ld1             {v13.8b, v14.8b}, [x14], x2 // row 7
    ld1             {v17.8b, v18.8b}, [x1], x2  // row 4
    ld1             {v21.8b, v22.8b}, [x14], x2 // row 8

    // ---- form each row shifted by one pixel (bytes 1..8) ----------------
    // v1, v3, v5 and v7 are reused as destinations, so the rows that still
    // need them as second-half source data (1, 5, 2, 6) are handled first.
    ext             v8.8b, v0.8b, v1.8b, #1     // row 1 shifted
    ext             v16.8b, v4.8b, v5.8b, #1    // row 2 shifted
    ext             v1.8b, v9.8b, v10.8b, #1    // row 3 shifted
    ext             v5.8b, v17.8b, v18.8b, #1   // row 4 shifted
    ext             v12.8b, v2.8b, v3.8b, #1    // row 5 shifted
    ext             v20.8b, v6.8b, v7.8b, #1    // row 6 shifted
    ext             v3.8b, v13.8b, v14.8b, #1   // row 7 shifted
    ext             v7.8b, v21.8b, v22.8b, #1   // row 8 shifted

    // ---- rounded horizontal averages ------------------------------------
    // Only the low 8 bytes of every result are stored below.
    urhadd          v0.16b, v0.16b, v8.16b      // out row 1
    urhadd          v4.16b, v4.16b, v16.16b     // out row 2
    urhadd          v1.16b, v1.16b, v9.16b      // out row 3
    urhadd          v5.16b, v5.16b, v17.16b     // out row 4
    urhadd          v2.16b, v2.16b, v12.16b     // out row 5
    urhadd          v6.16b, v6.16b, v20.16b     // out row 6
    urhadd          v3.16b, v3.16b, v13.16b     // out row 7
    urhadd          v7.16b, v7.16b, v21.16b     // out row 8

    // x0 writes output rows 1-4, x12 writes rows 5-8.
    add             x12, x0, x3, lsl #2

    // ---- store the 8 output rows ----------------------------------------
    st1             {v0.8b}, [x0], x3   // out row 1
    st1             {v2.8b}, [x12], x3  // out row 5
    st1             {v4.8b}, [x0], x3   // out row 2
    st1             {v6.8b}, [x12], x3  // out row 6
    st1             {v1.8b}, [x0], x3   // out row 3
    st1             {v3.8b}, [x12], x3  // out row 7
    st1             {v5.8b}, [x0], x3   // out row 4
    st1             {v7.8b}, [x12], x3  // out row 8

    pop_v_regs
    ret
394*a97c2a1fSXin Li
395*a97c2a1fSXin Li
396*a97c2a1fSXin Li
397*a97c2a1fSXin Li
398*a97c2a1fSXin Li
399*a97c2a1fSXin Li
400*a97c2a1fSXin Li
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : Builds an 8x8 block in which every output pixel
////                      is (a + b + c + d + 2) >> 2 over the 2x2
////                      reference neighbourhood whose top-left pixel is
////                      at the output position. Nine reference rows are
////                      read, 16 bytes per row, of which the first 9
////                      bytes contribute. Per the function name this is
////                      the half-pel-x / half-pel-y motion compensation
////                      case.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference row stride in bytes
////                      x3 - out_wid : Current block row stride in bytes
////
//// Registers Used     : x0, x1 (modified), x14, v0-v18, v20, v22, v24,
////                      v26, v28, v30
////
//// Stack Usage        : set by push_v_regs (macro defined outside this
////                      file); 64 bytes assumed - TODO confirm
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : v8-v15 are overwritten; push_v_regs and
////                      pop_v_regs are presumed to preserve them -
////                      confirm in impeg2_neon_macros.s
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    push_v_regs

    // x1 reads reference rows 1-4, x14 reads rows 5-9.
    add             x14, x1, x2, lsl #2

    // ---- load 16 bytes of every reference row ---------------------------
    ld1             {v0.8b, v1.8b}, [x1], x2    // row 1
    ld1             {v2.8b, v3.8b}, [x14], x2   // row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // row 6
    ld1             {v8.8b, v9.8b}, [x1], x2    // row 3
    ld1             {v10.8b, v11.8b}, [x14], x2 // row 7
    ld1             {v12.8b, v13.8b}, [x1], x2  // row 4
    ld1             {v14.8b, v15.8b}, [x14], x2 // row 8
    ld1             {v16.8b, v17.8b}, [x14], x2 // row 9

    // ---- horizontal pass ------------------------------------------------
    // For each row: build the copy shifted by one pixel, then widen-add it
    // to the unshifted row, leaving eight 16-bit sums p[i] + p[i+1].
    ext             v1.8b, v0.8b, v1.8b, #1
    uaddl           v0.8h, v0.8b, v1.8b         // row 1 sums

    ext             v5.8b, v4.8b, v5.8b, #1
    uaddl           v4.8h, v4.8b, v5.8b         // row 2 sums

    ext             v9.8b, v8.8b, v9.8b, #1
    uaddl           v8.8h, v8.8b, v9.8b         // row 3 sums

    ext             v13.8b, v12.8b, v13.8b, #1
    uaddl           v12.8h, v12.8b, v13.8b      // row 4 sums

    ext             v3.8b, v2.8b, v3.8b, #1
    uaddl           v2.8h, v2.8b, v3.8b         // row 5 sums

    ext             v7.8b, v6.8b, v7.8b, #1
    uaddl           v6.8h, v6.8b, v7.8b         // row 6 sums

    ext             v11.8b, v10.8b, v11.8b, #1
    uaddl           v10.8h, v10.8b, v11.8b      // row 7 sums

    ext             v15.8b, v14.8b, v15.8b, #1
    uaddl           v14.8h, v14.8b, v15.8b      // row 8 sums

    ext             v17.8b, v16.8b, v17.8b, #1
    uaddl           v16.8h, v16.8b, v17.8b      // row 9 sums

    // ---- vertical pass --------------------------------------------------
    // Add the sums of adjacent rows. v14 is both an input to the row 7
    // result and the destination of the row 8 result, so row 7 comes first.
    add             v18.8h, v0.8h, v4.8h        // rows 1 + 2
    add             v20.8h, v4.8h, v8.8h        // rows 2 + 3
    add             v22.8h, v8.8h, v12.8h       // rows 3 + 4
    add             v24.8h, v12.8h, v2.8h       // rows 4 + 5
    add             v26.8h, v2.8h, v6.8h        // rows 5 + 6
    add             v28.8h, v6.8h, v10.8h       // rows 6 + 7
    add             v30.8h, v10.8h, v14.8h      // rows 7 + 8
    add             v14.8h, v14.8h, v16.8h      // rows 8 + 9

    // Rounding shift right by 2 and narrow back to bytes: (sum + 2) >> 2.
    rshrn           v18.8b, v18.8h, #2          // out row 1
    rshrn           v20.8b, v20.8h, #2          // out row 2
    rshrn           v22.8b, v22.8h, #2          // out row 3
    rshrn           v24.8b, v24.8h, #2          // out row 4
    rshrn           v26.8b, v26.8h, #2          // out row 5
    rshrn           v28.8b, v28.8h, #2          // out row 6
    rshrn           v30.8b, v30.8h, #2          // out row 7
    rshrn           v14.8b, v14.8h, #2          // out row 8

    // x0 writes output rows 1-4; x14 is re-pointed at output row 5.
    add             x14, x0, x3, lsl #2

    // ---- store the 8 output rows ----------------------------------------
    st1             {v18.8b}, [x0], x3  // out row 1
    st1             {v26.8b}, [x14], x3 // out row 5
    st1             {v20.8b}, [x0], x3  // out row 2
    st1             {v28.8b}, [x14], x3 // out row 6
    st1             {v22.8b}, [x0], x3  // out row 3
    st1             {v30.8b}, [x14], x3 // out row 7
    st1             {v24.8b}, [x0], x3  // out row 4
    st1             {v14.8b}, [x14], x3 // out row 8

    pop_v_regs
    ret
571*a97c2a1fSXin Li
572*a97c2a1fSXin Li
573*a97c2a1fSXin Li
574*a97c2a1fSXin Li
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : Copies an 8x8 block (8 rows of 8 bytes) from the
////                      reference to the current block without any
////                      interpolation. Per the function name this is the
////                      full-pel-x / full-pel-y motion compensation case.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference row stride in bytes
////                      x3 - out_wid : Current block row stride in bytes
////
//// Registers Used     : x0, x1 (modified), x12, x14, v0-v3
////
//// Stack Usage        : set by push_v_regs (macro defined outside this
////                      file); 64 bytes assumed - TODO confirm
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Rows are moved in two batches of four; every
////                      batch is fully loaded before it is stored.
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:

    push_v_regs

    // Upper half (rows 1-4) goes through x1 -> x0,
    // lower half (rows 5-8) goes through x14 -> x12.
    add             x14, x1, x2, lsl #2 // x14 -> ref row 5
    add             x12, x0, x3, lsl #2 // x12 -> out row 5

    // First pass moves rows 1, 5, 2, 6; second pass moves rows 3, 7, 4, 8.
.rept 2
    ld1             {v0.8b}, [x1], x2   // next row of the upper half
    ld1             {v1.8b}, [x14], x2  // next row of the lower half
    ld1             {v2.8b}, [x1], x2   // following row of the upper half
    ld1             {v3.8b}, [x14], x2  // following row of the lower half

    st1             {v0.8b}, [x0], x3
    st1             {v1.8b}, [x12], x3
    st1             {v2.8b}, [x0], x3
    st1             {v3.8b}, [x12], x3
.endr

    pop_v_regs
    ret
654*a97c2a1fSXin Li
655*a97c2a1fSXin Li
656*a97c2a1fSXin Li
657*a97c2a1fSXin Li
658*a97c2a1fSXin Li///*
659*a97c2a1fSXin Li////---------------------------------------------------------------------------
660*a97c2a1fSXin Li//// Function Name      :   impeg2_interpolate_av8()
661*a97c2a1fSXin Li////
662*a97c2a1fSXin Li//// Detail Description : interpolates (rounded per-byte average of) two buffers and stores the result in the dest buf
663*a97c2a1fSXin Li////
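664*a97c2a1fSXin Li//// Inputs             : x0 - pointer to src1 (y, u, v plane pointers read from offsets 0, 8, 16)
665*a97c2a1fSXin Li////                      x1 - pointer to src2 (same layout as src1)
666*a97c2a1fSXin Li////                      x2 - dest buf (same layout as src1)
667*a97c2a1fSXin Li////                      x3 - dst stride (used as is for luma, halved for chroma)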
668*a97c2a1fSXin Li//// Registers Used     : x4, x5, x7, x12, v0-v15
669*a97c2a1fSXin Li////
670*a97c2a1fSXin Li//// Stack Usage        : 64 bytes
671*a97c2a1fSXin Li////
672*a97c2a1fSXin Li//// Outputs            : The Motion Compensated Block
673*a97c2a1fSXin Li////
674*a97c2a1fSXin Li//// Return Data        : None
675*a97c2a1fSXin Li////
676*a97c2a1fSXin Li//// Programming Note   : <program limitation>
677*a97c2a1fSXin Li////-----------------------------------------------------------------------------
678*a97c2a1fSXin Li//*/
679*a97c2a1fSXin Li
680*a97c2a1fSXin Li
681*a97c2a1fSXin Li.global impeg2_interpolate_av8
682*a97c2a1fSXin Li
683*a97c2a1fSXin Li
684*a97c2a1fSXin Liimpeg2_interpolate_av8:
// Averages two source pictures byte by byte with rounding (urhadd) and
// writes the result to the destination.
//   x0, x1, x2 : each points to three consecutive 64-bit pointers (offsets
//                0, 8, 16) that the code below labels y, u and v.
//   x3         : destination row step for luma; halved for the chroma planes.
// Both sources are read contiguously (post-increment by 16 bytes), only the
// destination pointer advances by the stride.
// Luma  : 4 loop passes x 4 rows x 16 bytes = 16 rows of 16 bytes.
// Chroma: 2 loop passes (u, then v), each 8 rows of 8 bytes.
// NOTE(review): x3 is consumed as a full 64-bit register; if the C prototype
// declares the stride as a 32-bit integer, confirm the upper bits are clean.
685*a97c2a1fSXin Li
686*a97c2a1fSXin Li//STMFD    x13!,{x4-x7,x12,x14}
    // push_v_regs is a macro defined outside this chunk. v8-v15 are
    // overwritten below, so it presumably preserves them - TODO confirm.
687*a97c2a1fSXin Li    push_v_regs
688*a97c2a1fSXin Li
689*a97c2a1fSXin Li    ldr             x4, [x0, #0]        //ptr_y src1
690*a97c2a1fSXin Li
691*a97c2a1fSXin Li    ldr             x5, [x1, #0]        //ptr_y src2
692*a97c2a1fSXin Li
693*a97c2a1fSXin Li    ldr             x7, [x2, #0]        //ptr_y dst buf
694*a97c2a1fSXin Li
695*a97c2a1fSXin Li    mov             x12, #4             //loop count: 4 passes of 4 luma rows each
696*a97c2a1fSXin Li
697*a97c2a1fSXin Li
698*a97c2a1fSXin Liinterp_lumablocks_stride:
699*a97c2a1fSXin Li    ld1             {v0.16b}, [x4], #16 //row1 of this pass, src1
700*a97c2a1fSXin Li
701*a97c2a1fSXin Li    ld1             {v2.16b}, [x4], #16 //row2 of this pass, src1
702*a97c2a1fSXin Li
703*a97c2a1fSXin Li    ld1             {v4.16b}, [x4], #16 //row3 of this pass, src1
704*a97c2a1fSXin Li
705*a97c2a1fSXin Li    ld1             {v6.16b}, [x4], #16 //row4 of this pass, src1
706*a97c2a1fSXin Li
707*a97c2a1fSXin Li
708*a97c2a1fSXin Li    ld1             {v8.16b}, [x5], #16 //row1 of this pass, src2
709*a97c2a1fSXin Li
710*a97c2a1fSXin Li    ld1             {v10.16b}, [x5], #16 //row2 of this pass, src2
711*a97c2a1fSXin Li
712*a97c2a1fSXin Li    ld1             {v12.16b}, [x5], #16 //row3 of this pass, src2
713*a97c2a1fSXin Li
714*a97c2a1fSXin Li    ld1             {v14.16b}, [x5], #16 //row4 of this pass, src2
715*a97c2a1fSXin Li
716*a97c2a1fSXin Li    urhadd          v0.16b, v0.16b , v8.16b //row1 = (src1 + src2 + 1) >> 1 per byte
717*a97c2a1fSXin Li
718*a97c2a1fSXin Li    urhadd          v2.16b, v2.16b , v10.16b //row2, same rounded average
719*a97c2a1fSXin Li
720*a97c2a1fSXin Li    urhadd          v4.16b, v4.16b , v12.16b //row3, same rounded average
721*a97c2a1fSXin Li
722*a97c2a1fSXin Li    urhadd          v6.16b, v6.16b , v14.16b //row4, same rounded average
723*a97c2a1fSXin Li    st1             {v0.16b}, [x7], x3  //store row1, dst advances by stride
724*a97c2a1fSXin Li
725*a97c2a1fSXin Li    st1             {v2.16b}, [x7], x3  //store row2
726*a97c2a1fSXin Li
727*a97c2a1fSXin Li    st1             {v4.16b}, [x7], x3  //store row3
728*a97c2a1fSXin Li
729*a97c2a1fSXin Li    st1             {v6.16b}, [x7], x3  //store row4
730*a97c2a1fSXin Li
731*a97c2a1fSXin Li    subs            x12, x12, #1
732*a97c2a1fSXin Li
733*a97c2a1fSXin Li    bne             interp_lumablocks_stride
734*a97c2a1fSXin Li
735*a97c2a1fSXin Li
736*a97c2a1fSXin Li    lsr             x3, x3, #1          //stride >> 1 (row step used for the chroma planes)
737*a97c2a1fSXin Li
738*a97c2a1fSXin Li    ldr             x4, [x0, #8]        //ptr_u src1
739*a97c2a1fSXin Li
740*a97c2a1fSXin Li    ldr             x5, [x1, #8]        //ptr_u src2
741*a97c2a1fSXin Li
742*a97c2a1fSXin Li    ldr             x7 , [x2, #8]       //ptr_u dst buf
743*a97c2a1fSXin Li
744*a97c2a1fSXin Li    mov             x12, #2             //loop count: pass 1 = u plane, pass 2 = v plane
745*a97c2a1fSXin Li
746*a97c2a1fSXin Li
747*a97c2a1fSXin Li
748*a97c2a1fSXin Li//chroma blocks
// Each two-register ld1 below fetches 16 contiguous bytes = two 8-byte rows,
// one row per register.
749*a97c2a1fSXin Li
750*a97c2a1fSXin Liinterp_chromablocks_stride:
751*a97c2a1fSXin Li    ld1             {v0.8b, v1.8b}, [x4], #16 //row1 & 2 src1
752*a97c2a1fSXin Li
753*a97c2a1fSXin Li    ld1             {v2.8b, v3.8b}, [x4], #16 //row3 & 4 src1
754*a97c2a1fSXin Li
755*a97c2a1fSXin Li    ld1             {v4.8b, v5.8b}, [x4], #16 //row5 & 6 src1
756*a97c2a1fSXin Li
757*a97c2a1fSXin Li    ld1             {v6.8b, v7.8b}, [x4], #16 //row7 & 8 src1
758*a97c2a1fSXin Li
759*a97c2a1fSXin Li
760*a97c2a1fSXin Li    ld1             {v8.8b, v9.8b}, [x5], #16 //row1 & 2 src2
761*a97c2a1fSXin Li
762*a97c2a1fSXin Li    ld1             {v10.8b, v11.8b}, [x5], #16 //row3 & 4 src2
763*a97c2a1fSXin Li
764*a97c2a1fSXin Li    ld1             {v12.8b, v13.8b}, [x5], #16 //row5 & 6 src2
765*a97c2a1fSXin Li
766*a97c2a1fSXin Li    ld1             {v14.8b, v15.8b}, [x5], #16 //row7 & 8 src2
767*a97c2a1fSXin Li
// The .16b form also processes the upper 8 lanes, but only the low 8 bytes
// of each result are stored below.
768*a97c2a1fSXin Li    urhadd          v0.16b, v0.16b , v8.16b //rounded average, row1
769*a97c2a1fSXin Li    urhadd          v1.16b, v1.16b , v9.16b //rounded average, row2
770*a97c2a1fSXin Li
771*a97c2a1fSXin Li    urhadd          v2.16b, v2.16b , v10.16b //rounded average, row3
772*a97c2a1fSXin Li    urhadd          v3.16b, v3.16b , v11.16b //rounded average, row4
773*a97c2a1fSXin Li
774*a97c2a1fSXin Li    urhadd          v4.16b, v4.16b , v12.16b //rounded average, row5
775*a97c2a1fSXin Li    urhadd          v5.16b, v5.16b , v13.16b //rounded average, row6
776*a97c2a1fSXin Li
777*a97c2a1fSXin Li    urhadd          v6.16b, v6.16b , v14.16b //rounded average, row7
778*a97c2a1fSXin Li    urhadd          v7.16b, v7.16b , v15.16b //rounded average, row8
779*a97c2a1fSXin Li
780*a97c2a1fSXin Li    st1             {v0.8b}, [x7], x3   //row1
781*a97c2a1fSXin Li
782*a97c2a1fSXin Li    st1             {v1.8b}, [x7], x3   //row2
783*a97c2a1fSXin Li
784*a97c2a1fSXin Li    st1             {v2.8b}, [x7], x3   //row3
785*a97c2a1fSXin Li
786*a97c2a1fSXin Li    st1             {v3.8b}, [x7], x3   //row4
787*a97c2a1fSXin Li
788*a97c2a1fSXin Li    st1             {v4.8b}, [x7], x3   //row5
789*a97c2a1fSXin Li
790*a97c2a1fSXin Li    st1             {v5.8b}, [x7], x3   //row6
791*a97c2a1fSXin Li
792*a97c2a1fSXin Li    st1             {v6.8b}, [x7], x3   //row7
793*a97c2a1fSXin Li
794*a97c2a1fSXin Li    st1             {v7.8b}, [x7], x3   //row8
795*a97c2a1fSXin Li
796*a97c2a1fSXin Li
// Switch to the v plane for the second pass. These loads also execute after
// the final pass, where the loaded pointers are not used again.
797*a97c2a1fSXin Li    ldr             x4, [x0, #16]       //ptr_v src1
798*a97c2a1fSXin Li
799*a97c2a1fSXin Li    ldr             x5, [x1, #16]       //ptr_v src2
800*a97c2a1fSXin Li
801*a97c2a1fSXin Li    ldr             x7, [x2, #16]       //ptr_v dst buf
802*a97c2a1fSXin Li
803*a97c2a1fSXin Li    subs            x12, x12, #1
804*a97c2a1fSXin Li
805*a97c2a1fSXin Li    bne             interp_chromablocks_stride
806*a97c2a1fSXin Li
807*a97c2a1fSXin Li
808*a97c2a1fSXin Li    //LDMFD  x13!,{x4-x7,x12,PC}
    // pop_v_regs: counterpart of push_v_regs above (macro not visible here).
809*a97c2a1fSXin Li    pop_v_regs
810*a97c2a1fSXin Li    ret
811*a97c2a1fSXin Li
812*a97c2a1fSXin Li
813*a97c2a1fSXin Li
814*a97c2a1fSXin Li
815