xref: /aosp_15_r20/external/libmpeg2/common/arm/impeg2_idct.s (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li@/******************************************************************************
2*a97c2a1fSXin Li@ *
3*a97c2a1fSXin Li@ * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li@ *
5*a97c2a1fSXin Li@ * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li@ * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li@ * You may obtain a copy of the License at:
8*a97c2a1fSXin Li@ *
9*a97c2a1fSXin Li@ * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li@ *
11*a97c2a1fSXin Li@ * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li@ * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li@ * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li@ * limitations under the License.
16*a97c2a1fSXin Li@ *
17*a97c2a1fSXin Li@ *****************************************************************************
18*a97c2a1fSXin Li@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li@*/
20*a97c2a1fSXin Li
21*a97c2a1fSXin Li@/*
22*a97c2a1fSXin Li@//----------------------------------------------------------------------------
23*a97c2a1fSXin Li@// File Name            : impeg2_idct.s
24*a97c2a1fSXin Li@//
25*a97c2a1fSXin Li@// Description          : This file has the Idct Implementations for the
26*a97c2a1fSXin Li@//                        MPEG2 SP decoder on neon platform.
27*a97c2a1fSXin Li@//
28*a97c2a1fSXin Li@// Reference Document   :
29*a97c2a1fSXin Li@//
30*a97c2a1fSXin Li@// Revision History     :
31*a97c2a1fSXin Li@//      Date            Author                  Detail Description
32*a97c2a1fSXin Li@//   ------------    ----------------    ----------------------------------
33*a97c2a1fSXin Li@//   Feb 22, 2008     Naveen Kumar T                Created
34*a97c2a1fSXin Li@//
35*a97c2a1fSXin Li@//-------------------------------------------------------------------------
36*a97c2a1fSXin Li@*/
37*a97c2a1fSXin Li
38*a97c2a1fSXin Li@/*
39*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
40*a97c2a1fSXin Li@// Include Files
41*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
42*a97c2a1fSXin Li@*/
43*a97c2a1fSXin Li
@// Code goes in .text, aligned to 2^2 = 4 bytes.
44*a97c2a1fSXin Li.text
45*a97c2a1fSXin Li.p2align 2
@// Right-shift amounts applied after the first and second scaling stages.
46*a97c2a1fSXin Li.equ idct_stg1_shift       ,            12
47*a97c2a1fSXin Li.equ idct_stg2_shift       ,            16
@// Rounding terms: half of (1 << shift), added before the arithmetic shift so
@// that the shift rounds to nearest instead of truncating.
48*a97c2a1fSXin Li.equ idct_stg1_round     ,          (1 << (idct_stg1_shift - 1))
49*a97c2a1fSXin Li.equ idct_stg2_round     ,          (1 << (idct_stg2_shift - 1))
50*a97c2a1fSXin Li@/*
51*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
52*a97c2a1fSXin Li@// Struct/Union Types and Define
53*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
54*a97c2a1fSXin Li@*/
55*a97c2a1fSXin Li
56*a97c2a1fSXin Li@/*
57*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
58*a97c2a1fSXin Li@// Static Global Data section variables
59*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
60*a97c2a1fSXin Li@*/
61*a97c2a1fSXin Li@//--------------------------- NONE --------------------------------------------
62*a97c2a1fSXin Li
63*a97c2a1fSXin Li@/*
64*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
65*a97c2a1fSXin Li@// Static Prototype Functions
66*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
67*a97c2a1fSXin Li@*/
68*a97c2a1fSXin Li@// -------------------------- NONE --------------------------------------------
69*a97c2a1fSXin Li
70*a97c2a1fSXin Li@/*
71*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
72*a97c2a1fSXin Li@// Exported functions
73*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
74*a97c2a1fSXin Li@*/
75*a97c2a1fSXin Li
@// Tables referenced by this file but defined elsewhere. Each one is declared
@// external and given hidden ELF visibility; the routines below reach them
@// through the PC-relative offset words that follow these declarations.
76*a97c2a1fSXin Li    .extern gai2_impeg2_idct_q15
77*a97c2a1fSXin Li.hidden gai2_impeg2_idct_q15
78*a97c2a1fSXin Li    .extern gai2_impeg2_idct_q11
79*a97c2a1fSXin Li.hidden gai2_impeg2_idct_q11
80*a97c2a1fSXin Li    .extern gai2_impeg2_idct_first_col_q15
81*a97c2a1fSXin Li.hidden gai2_impeg2_idct_first_col_q15
82*a97c2a1fSXin Li    .extern gai2_impeg2_idct_first_col_q11
83*a97c2a1fSXin Li.hidden gai2_impeg2_idct_first_col_q11
84*a97c2a1fSXin Li    .extern gai2_impeg2_mismatch_stg2_additive
85*a97c2a1fSXin Li.hidden gai2_impeg2_mismatch_stg2_additive
86*a97c2a1fSXin Li
@// ----------------------------------------------------------------------------
@// PC-relative offset words (position-independent table addressing).
@//
@// Each word stores (table - label - 8). The code that uses it has the shape
@//
@//         ldr   rX, <name>_addrN
@//     label:
@//         add   rX, rX, pc
@//
@// In ARM state, pc read by the add at `label` is (label + 8), so the sum is the
@// run-time address of the table. Consequently every label named here must stay
@// on the `add ..., pc` instruction itself, and each label needs its own word.
@// ----------------------------------------------------------------------------
87*a97c2a1fSXin Ligai2_impeg2_idct_q15_addr1:
88*a97c2a1fSXin Li    .long gai2_impeg2_idct_q15 - q15lbl1 - 8
89*a97c2a1fSXin Ligai2_impeg2_idct_q15_addr2:
90*a97c2a1fSXin Li    .long gai2_impeg2_idct_q15 - q15lbl2 - 8
91*a97c2a1fSXin Ligai2_impeg2_idct_q11_addr1:
92*a97c2a1fSXin Li    .long gai2_impeg2_idct_q11 - q11lbl1 - 8
93*a97c2a1fSXin Ligai2_impeg2_idct_q11_addr2:
94*a97c2a1fSXin Li    .long gai2_impeg2_idct_q11 - q11lbl2 - 8
95*a97c2a1fSXin Ligai2_impeg2_idct_first_col_q15_addr1:
96*a97c2a1fSXin Li    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8
97*a97c2a1fSXin Ligai2_impeg2_idct_first_col_q15_addr2:
98*a97c2a1fSXin Li    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8
99*a97c2a1fSXin Ligai2_impeg2_idct_first_col_q15_addr3:
100*a97c2a1fSXin Li    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8
101*a97c2a1fSXin Ligai2_impeg2_mismatch_stg2_additive_addr:
102*a97c2a1fSXin Li    .long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8
103*a97c2a1fSXin Ligai2_impeg2_idct_first_col_q11_addr1:
104*a97c2a1fSXin Li    .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8
105*a97c2a1fSXin Ligai2_impeg2_idct_first_col_q11_addr2:
106*a97c2a1fSXin Li    .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8
107*a97c2a1fSXin Li
@// ----------------------------------------------------------------------------
@// impeg2_idct_recon_dc_a9q
@//
@// DC-only inverse transform + reconstruction of one 8x8 block.
@//
@// Only the first 16-bit coefficient at pi2_src is read. It is scaled in two
@// steps into a single value:
@//     t  = (src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round) >> idct_stg1_shift
@//     dc = (t      * gai2_impeg2_idct_q11[0] + idct_stg2_round) >> idct_stg2_shift
@// dc is then added to every pixel of eight 8-byte prediction rows, the sums are
@// saturated to unsigned 8 bit and written as eight 8-byte rows to pu1_dst.
@//
@// In:   r0 = pi2_src, r1 = pi2_tmp (never read), r2 = pu1_pred, r3 = pu1_dst
@//       stack: the words read at [sp, #84] / [sp, #88] after the pushes are
@//       pred_strd / dst_strd; the word at [sp, #80] is not read.
@// Out:  nothing returned; r2 and r3 are left advanced by 8 strides.
@// Saves r4, r6, r12, lr and d8-d15 (q4-q7 are used as temporaries below).
@// The scalar arithmetic is interleaved with the prediction loads.
@// ----------------------------------------------------------------------------
108*a97c2a1fSXin Li    .global impeg2_idct_recon_dc_a9q
109*a97c2a1fSXin Liimpeg2_idct_recon_dc_a9q:
110*a97c2a1fSXin Li    stmfd           sp!, {r4, r6, r12, lr}
111*a97c2a1fSXin Li    vpush           {d8-d15}
    @// 16 bytes of core registers + 64 bytes of NEON registers = 80 bytes pushed
112*a97c2a1fSXin Li    @//r0: pi2_src
113*a97c2a1fSXin Li    @//r1: pi2_tmp on entry (not used); overwritten with pred_strd
114*a97c2a1fSXin Li    @//r2: pu1_pred
115*a97c2a1fSXin Li    @//r3: pu1_dst
116*a97c2a1fSXin Li    @//r4: scratch - holds the scalar DC value while it is scaled
117*a97c2a1fSXin Li    @//r6: dst_strd; r12: table entry; r14: table address (lr already saved)
118*a97c2a1fSXin Li
119*a97c2a1fSXin Li    ldr             r1, [sp, #84]       @//pred_strd
120*a97c2a1fSXin Li    ldr             r6, [sp, #88]       @//dst_strd
121*a97c2a1fSXin Li
    @// r14 = &gai2_impeg2_idct_q15 (offset word + pc, see the offset table)
122*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_q15_addr1
123*a97c2a1fSXin Liq15lbl1:
124*a97c2a1fSXin Li    add             r14, r14, pc
125*a97c2a1fSXin Li    ldrsh           r12, [r14]          @// first entry of the q15 table
126*a97c2a1fSXin Li    ldrsh           r4, [r0]            @// src[0], sign-extended
127*a97c2a1fSXin Li
    @// Stage 1 scaling, interleaved with loading prediction rows 0-2
128*a97c2a1fSXin Li    vld1.8          d0, [r2], r1
129*a97c2a1fSXin Li    mul             r4, r4, r12
130*a97c2a1fSXin Li
131*a97c2a1fSXin Li    vld1.8          d1, [r2], r1
132*a97c2a1fSXin Li    add             r4, #idct_stg1_round
133*a97c2a1fSXin Li
134*a97c2a1fSXin Li    vld1.8          d2, [r2], r1
135*a97c2a1fSXin Li    asr             r4, r4, #idct_stg1_shift
136*a97c2a1fSXin Li
    @// r14 = &gai2_impeg2_idct_q11
137*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_q11_addr1
138*a97c2a1fSXin Liq11lbl1:
139*a97c2a1fSXin Li    add             r14, r14, pc
140*a97c2a1fSXin Li    ldrsh           r12, [r14]          @// first entry of the q11 table
141*a97c2a1fSXin Li
    @// Stage 2 scaling, interleaved with loading prediction rows 3-6
142*a97c2a1fSXin Li    vld1.8          d3, [r2], r1
143*a97c2a1fSXin Li    mul             r4, r4, r12
144*a97c2a1fSXin Li
145*a97c2a1fSXin Li    vld1.8          d4, [r2], r1
146*a97c2a1fSXin Li    add             r4, #idct_stg2_round
147*a97c2a1fSXin Li
148*a97c2a1fSXin Li    vld1.8          d5, [r2], r1
149*a97c2a1fSXin Li    asr             r4, r4, #idct_stg2_shift
150*a97c2a1fSXin Li
151*a97c2a1fSXin Li    vld1.8          d6, [r2], r1
152*a97c2a1fSXin Li    vdup.s16        q15, r4             @// q15 = dc replicated in 8 x 16-bit lanes
153*a97c2a1fSXin Li
154*a97c2a1fSXin Li
155*a97c2a1fSXin Li    vld1.8          d7, [r2], r1
156*a97c2a1fSXin Li
    @// Per row k (software pipelined):
    @//   vaddw.u8    : 16-bit sum = dc + zero-extended prediction bytes
    @//   vqmovun.s16 : saturate the signed 16-bit sums to unsigned 8 bit
    @//   vst1.8      : store the 8 result bytes, then advance r3 by dst_strd
157*a97c2a1fSXin Li    vaddw.u8        q4, q15, d0
158*a97c2a1fSXin Li
159*a97c2a1fSXin Li    vaddw.u8        q5, q15, d1
160*a97c2a1fSXin Li    vqmovun.s16     d0, q4
161*a97c2a1fSXin Li
162*a97c2a1fSXin Li    vaddw.u8        q6, q15, d2
163*a97c2a1fSXin Li    vqmovun.s16     d1, q5
164*a97c2a1fSXin Li    vst1.8          d0, [r3], r6
165*a97c2a1fSXin Li
166*a97c2a1fSXin Li    vaddw.u8        q7, q15, d3
167*a97c2a1fSXin Li    vqmovun.s16     d2, q6
168*a97c2a1fSXin Li    vst1.8          d1, [r3], r6
169*a97c2a1fSXin Li
170*a97c2a1fSXin Li    vaddw.u8        q8, q15, d4
171*a97c2a1fSXin Li    vqmovun.s16     d3, q7
172*a97c2a1fSXin Li    vst1.8          d2, [r3], r6
173*a97c2a1fSXin Li
174*a97c2a1fSXin Li    vaddw.u8        q9, q15, d5
175*a97c2a1fSXin Li    vqmovun.s16     d4, q8
176*a97c2a1fSXin Li    vst1.8          d3, [r3], r6
177*a97c2a1fSXin Li
178*a97c2a1fSXin Li    vaddw.u8        q10, q15, d6
179*a97c2a1fSXin Li    vqmovun.s16     d5, q9
180*a97c2a1fSXin Li    vst1.8          d4, [r3], r6
181*a97c2a1fSXin Li
182*a97c2a1fSXin Li    vaddw.u8        q11, q15, d7
183*a97c2a1fSXin Li    vqmovun.s16     d6, q10
184*a97c2a1fSXin Li    vst1.8          d5, [r3], r6
185*a97c2a1fSXin Li
186*a97c2a1fSXin Li    vqmovun.s16     d7, q11
187*a97c2a1fSXin Li    vst1.8          d6, [r3], r6
188*a97c2a1fSXin Li
189*a97c2a1fSXin Li
190*a97c2a1fSXin Li    vst1.8          d7, [r3], r6
191*a97c2a1fSXin Li
    @// Restore callee-saved state; loading the saved lr into pc returns
192*a97c2a1fSXin Li    vpop            {d8-d15}
193*a97c2a1fSXin Li    ldmfd           sp!, {r4, r6, r12, pc}
194*a97c2a1fSXin Li
195*a97c2a1fSXin Li
196*a97c2a1fSXin Li
197*a97c2a1fSXin Li
@// ----------------------------------------------------------------------------
@// impeg2_idct_recon_dc_mismatch_a9q
@//
@// DC reconstruction of one 8x8 block with a per-pixel additive term taken from
@// gai2_impeg2_mismatch_stg2_additive.
@//
@// Stage 1 is the same as in impeg2_idct_recon_dc_a9q:
@//     t = (src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round) >> idct_stg1_shift
@// Stage 2 keeps the full 32-bit product p = t * gai2_impeg2_idct_q11[0]; the
@// rounding and the 16-bit shift are done per pixel by vraddhn after the table
@// value has been added:
@//     res[r][c] = (p + additive[r][c] + (1 << 15)) >> 16
@//     dst[r][c] = saturate_u8(res[r][c] + pred[r][c])
@// The table is read as 8 rows of eight 16-bit values (16 bytes per row).
@//
@// In:   r0 = pi2_src, r1 = (overwritten, never read), r2 = pu1_pred,
@//       r3 = pu1_dst
@//       stack: the words read at [sp, #108] / [sp, #112] after the pushes are
@//       pred_strd / dst_strd; the word at [sp, #104] is not read.
@// Out:  nothing returned; r2 and r3 are left advanced by 8 strides.
@// Saves r4-r12, lr and d8-d15 (q4-q7 are used as temporaries below).
@// ----------------------------------------------------------------------------
198*a97c2a1fSXin Li    .global impeg2_idct_recon_dc_mismatch_a9q
199*a97c2a1fSXin Liimpeg2_idct_recon_dc_mismatch_a9q:
200*a97c2a1fSXin Li    stmfd           sp!, {r4-r12, lr}
201*a97c2a1fSXin Li    vpush           {d8-d15}
    @// 40 bytes of core registers + 64 bytes of NEON registers = 104 bytes pushed
202*a97c2a1fSXin Li
203*a97c2a1fSXin Li    ldr             r1, [sp, #108]      @//pred_strd
204*a97c2a1fSXin Li    ldr             r6, [sp, #112]      @//dst_strd
205*a97c2a1fSXin Li
    @// r14 = &gai2_impeg2_idct_q15 (lr is already saved, so r14 is free)
206*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_q15_addr2
207*a97c2a1fSXin Liq15lbl2:
208*a97c2a1fSXin Li    add             r14, r14, pc
209*a97c2a1fSXin Li    ldrsh           r12, [r14]          @// first entry of the q15 table
210*a97c2a1fSXin Li    ldrsh           r4, [r0]            @// src[0], sign-extended
211*a97c2a1fSXin Li
    @// Stage 1: multiply, round, shift
212*a97c2a1fSXin Li    mul             r4, r4, r12
213*a97c2a1fSXin Li    add             r4, #idct_stg1_round
214*a97c2a1fSXin Li    asr             r4, r4, #idct_stg1_shift
215*a97c2a1fSXin Li
    @// Stage 2: multiply only - rounding/shift happen in vraddhn below
216*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_q11_addr2
217*a97c2a1fSXin Liq11lbl2:
218*a97c2a1fSXin Li    add             r14, r14, pc
219*a97c2a1fSXin Li    ldrsh           r12, [r14]          @// first entry of the q11 table
220*a97c2a1fSXin Li    mul             r4, r4, r12
221*a97c2a1fSXin Li    vdup.s32        q0, r4              @// q0 = product in 4 x 32-bit lanes
222*a97c2a1fSXin Li
223*a97c2a1fSXin Li    mov             r14, #16            @//Increment for table read
    @// r4 = &gai2_impeg2_mismatch_stg2_additive (r4 is reused as the table pointer)
224*a97c2a1fSXin Li    ldr             r4, gai2_impeg2_mismatch_stg2_additive_addr
225*a97c2a1fSXin Liadditive_lbl:
226*a97c2a1fSXin Li    add             r4, r4, pc
227*a97c2a1fSXin Li
    @// ---- row 0 ----
    @// q1     = 8 additive values (s16) for this row; r4 += 16
    @// d30    = 8 prediction bytes; r2 += pred_strd
    @// q4/q5  = additive values sign-extended to 32 bit (lanes 0-3 / 4-7)
    @// d12/13 = (q0 + q4/q5 + rounding) >> 16, narrowed to 16 bit -> q6
    @// q7     = q6 + zero-extended prediction
    @// d30    = q7 saturated to unsigned 8 bit, stored; r3 += dst_strd
228*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
229*a97c2a1fSXin Li
230*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
231*a97c2a1fSXin Li    vmovl.s16       q4, d2
232*a97c2a1fSXin Li    vmovl.s16       q5, d3
233*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
234*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
235*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
236*a97c2a1fSXin Li    vqmovun.s16     d30, q7
237*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
238*a97c2a1fSXin Li
    @// ---- row 1 (same sequence as row 0) ----
239*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
240*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
241*a97c2a1fSXin Li    vmovl.s16       q4, d2
242*a97c2a1fSXin Li    vmovl.s16       q5, d3
243*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
244*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
245*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
246*a97c2a1fSXin Li    vqmovun.s16     d30, q7
247*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
248*a97c2a1fSXin Li
    @// ---- row 2 ----
249*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
250*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
251*a97c2a1fSXin Li    vmovl.s16       q4, d2
252*a97c2a1fSXin Li    vmovl.s16       q5, d3
253*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
254*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
255*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
256*a97c2a1fSXin Li    vqmovun.s16     d30, q7
257*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
258*a97c2a1fSXin Li
    @// ---- row 3 ----
259*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
260*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
261*a97c2a1fSXin Li    vmovl.s16       q4, d2
262*a97c2a1fSXin Li    vmovl.s16       q5, d3
263*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
264*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
265*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
266*a97c2a1fSXin Li    vqmovun.s16     d30, q7
267*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
268*a97c2a1fSXin Li
    @// ---- row 4 ----
269*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
270*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
271*a97c2a1fSXin Li    vmovl.s16       q4, d2
272*a97c2a1fSXin Li    vmovl.s16       q5, d3
273*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
274*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
275*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
276*a97c2a1fSXin Li    vqmovun.s16     d30, q7
277*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
278*a97c2a1fSXin Li
    @// ---- row 5 ----
279*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
280*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
281*a97c2a1fSXin Li    vmovl.s16       q4, d2
282*a97c2a1fSXin Li    vmovl.s16       q5, d3
283*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
284*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
285*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
286*a97c2a1fSXin Li    vqmovun.s16     d30, q7
287*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
288*a97c2a1fSXin Li
    @// ---- row 6 ----
289*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
290*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
291*a97c2a1fSXin Li    vmovl.s16       q4, d2
292*a97c2a1fSXin Li    vmovl.s16       q5, d3
293*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
294*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
295*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
296*a97c2a1fSXin Li    vqmovun.s16     d30, q7
297*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
298*a97c2a1fSXin Li
    @// ---- row 7 ----
299*a97c2a1fSXin Li    vld1.16         {q1}, [r4], r14
300*a97c2a1fSXin Li    vld1.8          d30, [r2], r1
301*a97c2a1fSXin Li    vmovl.s16       q4, d2
302*a97c2a1fSXin Li    vmovl.s16       q5, d3
303*a97c2a1fSXin Li    vraddhn.s32     d12, q0, q4
304*a97c2a1fSXin Li    vraddhn.s32     d13, q0, q5
305*a97c2a1fSXin Li    vaddw.u8        q7, q6, d30
306*a97c2a1fSXin Li    vqmovun.s16     d30, q7
307*a97c2a1fSXin Li    vst1.8          d30, [r3], r6
308*a97c2a1fSXin Li
309*a97c2a1fSXin Li
    @// Restore callee-saved state; loading the saved lr into pc returns
310*a97c2a1fSXin Li    vpop            {d8-d15}
311*a97c2a1fSXin Li    ldmfd           sp!, {r4-r12, pc}
312*a97c2a1fSXin Li
313*a97c2a1fSXin Li
314*a97c2a1fSXin Li
315*a97c2a1fSXin Li
316*a97c2a1fSXin Li@/**
317*a97c2a1fSXin Li@ *******************************************************************************
318*a97c2a1fSXin Li@ *
319*a97c2a1fSXin Li@ * ;brief
320*a97c2a1fSXin Li@ *  This function performs Inverse transform  and reconstruction for 8x8
321*a97c2a1fSXin Li@ * input block
322*a97c2a1fSXin Li@ *
323*a97c2a1fSXin Li@ * ;par Description:
324*a97c2a1fSXin Li@ *  Performs inverse transform and adds the prediction  data and clips output
325*a97c2a1fSXin Li@ * to 8 bit
326*a97c2a1fSXin Li@ *
327*a97c2a1fSXin Li@ * ;param[in] pi2_src
328*a97c2a1fSXin Li@ *  Input 8x8 coefficients
329*a97c2a1fSXin Li@ *
330*a97c2a1fSXin Li@ * ;param[in] pi2_tmp
331*a97c2a1fSXin Li@ *  Temporary 8x8 buffer for storing inverse
332*a97c2a1fSXin Li@ *
333*a97c2a1fSXin Li@ *  transform
334*a97c2a1fSXin Li@ *  1st stage output
335*a97c2a1fSXin Li@ *
336*a97c2a1fSXin Li@ * ;param[in] pu1_pred
337*a97c2a1fSXin Li@ *  Prediction 8x8 block
338*a97c2a1fSXin Li@ *
339*a97c2a1fSXin Li@ * ;param[out] pu1_dst
340*a97c2a1fSXin Li@ *  Output 8x8 block
341*a97c2a1fSXin Li@ *
342*a97c2a1fSXin Li@ * ;param[in] src_strd
343*a97c2a1fSXin Li@ *  Input stride
344*a97c2a1fSXin Li@ *
345*a97c2a1fSXin Li@ * ;param[in] pred_strd
346*a97c2a1fSXin Li@ *  Prediction stride
347*a97c2a1fSXin Li@ *
348*a97c2a1fSXin Li@ * ;param[in] dst_strd
349*a97c2a1fSXin Li@ *  Output Stride
350*a97c2a1fSXin Li@ *
351*a97c2a1fSXin Li@ * ;param[in] shift
352*a97c2a1fSXin Li@ *  Output shift
353*a97c2a1fSXin Li@ *
354*a97c2a1fSXin Li@ * ;param[in] zero_cols
355*a97c2a1fSXin Li@ *  Zero columns in pi2_src
356*a97c2a1fSXin Li@ *
357*a97c2a1fSXin Li@ * ;returns  Void
358*a97c2a1fSXin Li@ *
359*a97c2a1fSXin Li@ * ;remarks
360*a97c2a1fSXin Li@ *  None
361*a97c2a1fSXin Li@ *
362*a97c2a1fSXin Li@ *******************************************************************************
363*a97c2a1fSXin Li@ */
364*a97c2a1fSXin Li
365*a97c2a1fSXin Li@void impeg2_itrans_recon_8x8(WORD16 *pi2_src,
366*a97c2a1fSXin Li@                            WORD16 *pi2_tmp,
367*a97c2a1fSXin Li@                            UWORD8 *pu1_pred,
368*a97c2a1fSXin Li@                            UWORD8 *pu1_dst,
369*a97c2a1fSXin Li@                            WORD32 src_strd,
370*a97c2a1fSXin Li@                            WORD32 pred_strd,
371*a97c2a1fSXin Li@                            WORD32 dst_strd,
372*a97c2a1fSXin Li@                            WORD32 zero_cols,
373*a97c2a1fSXin Li@                            WORD32 zero_rows               )
374*a97c2a1fSXin Li
375*a97c2a1fSXin Li@**************Variables Vs Registers*************************
376*a97c2a1fSXin Li@   r0 => *pi2_src
377*a97c2a1fSXin Li@   r1 => *pi2_tmp
378*a97c2a1fSXin Li@   r2 => *pu1_pred
379*a97c2a1fSXin Li@   r3 => *pu1_dst
380*a97c2a1fSXin Li@   src_strd
381*a97c2a1fSXin Li@   pred_strd
382*a97c2a1fSXin Li@   dst_strd
383*a97c2a1fSXin Li@   zero_cols
@   zero_rows
384*a97c2a1fSXin Li
385*a97c2a1fSXin Li
386*a97c2a1fSXin Li
387*a97c2a1fSXin Li    .global impeg2_idct_recon_a9q
388*a97c2a1fSXin Liimpeg2_idct_recon_a9q:
389*a97c2a1fSXin Li@//Register Usage Reference     - loading and Until IDCT of columns
390*a97c2a1fSXin Li@// Cosine Constants    -   D0
391*a97c2a1fSXin Li@// Sine Constants      -   D1
392*a97c2a1fSXin Li@// Row 0 First Half    -   D2      -   y0
393*a97c2a1fSXin Li@// Row 1 First Half    -   D6      -   y1
394*a97c2a1fSXin Li@// Row 2 First Half    -   D3      -   y2
395*a97c2a1fSXin Li@// Row 3 First Half    -   D7      -   y3
396*a97c2a1fSXin Li@// Row 4 First Half    -   D10     -   y4
397*a97c2a1fSXin Li@// Row 5 First Half    -   D14     -   y5
398*a97c2a1fSXin Li@// Row 6 First Half    -   D11     -   y6
399*a97c2a1fSXin Li@// Row 7 First Half    -   D15     -   y7
400*a97c2a1fSXin Li
401*a97c2a1fSXin Li@// Row 0 Second Half   -   D4      -   y0
402*a97c2a1fSXin Li@// Row 1 Second Half   -   D8      -   y1
403*a97c2a1fSXin Li@// Row 2 Second Half   -   D5      -   y2
404*a97c2a1fSXin Li@// Row 3 Second Half   -   D9      -   y3
405*a97c2a1fSXin Li@// Row 4 Second Half   -   D12     -   y4
406*a97c2a1fSXin Li@// Row 5 Second Half   -   D16     -   y5
407*a97c2a1fSXin Li@// Row 6 Second Half   -   D13     -   y6
408*a97c2a1fSXin Li@// Row 7 Second Half   -   D17     -   y7
409*a97c2a1fSXin Li
410*a97c2a1fSXin Li    @// Copy the input pointer to another register
411*a97c2a1fSXin Li    @// Step 1 : load all constants
412*a97c2a1fSXin Li    stmfd           sp!, {r4-r12, lr}
413*a97c2a1fSXin Li    vpush           {d8-d15}
414*a97c2a1fSXin Li
415*a97c2a1fSXin Li    ldr             r8, [sp, #108]        @ prediction stride
416*a97c2a1fSXin Li    ldr             r7, [sp, #112]        @ destination stride
417*a97c2a1fSXin Li    ldr             r6, [sp, #104]            @ src stride
418*a97c2a1fSXin Li    ldr             r12, [sp, #116]
419*a97c2a1fSXin Li    ldr             r11, [sp, #120]
420*a97c2a1fSXin Li
421*a97c2a1fSXin Li    mov             r6, r6, lsl #1      @ x sizeof(word16)
422*a97c2a1fSXin Li    add             r9, r0, r6, lsl #1  @ 2 rows
423*a97c2a1fSXin Li
424*a97c2a1fSXin Li    add             r10, r6, r6, lsl #1 @ 3 rows
425*a97c2a1fSXin Li
426*a97c2a1fSXin Li    sub             r10, r10, #8        @ - 4 cols * sizeof(WORD16)
427*a97c2a1fSXin Li    sub             r5, r6, #8          @ src_strd - 4 cols * sizeof(WORD16)
428*a97c2a1fSXin Li
429*a97c2a1fSXin Li
430*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_first_col_q15_addr1
431*a97c2a1fSXin Lifcq15_lbl1:
432*a97c2a1fSXin Li    add             r14, r14, pc
433*a97c2a1fSXin Li    vld1.16         {d0, d1}, [r14]     @//D0,D1 are used for storing the constant data
434*a97c2a1fSXin Li
435*a97c2a1fSXin Li    @//Step 2 Load all the input data
436*a97c2a1fSXin Li    @//Step 3 Operate first 4 colums at a time
437*a97c2a1fSXin Li
438*a97c2a1fSXin Li    and             r11, r11, #0xff
439*a97c2a1fSXin Li    and             r12, r12, #0xff
440*a97c2a1fSXin Li
441*a97c2a1fSXin Li    cmp             r11, #0xf0
442*a97c2a1fSXin Li    bge             skip_last4_rows
443*a97c2a1fSXin Li
444*a97c2a1fSXin Li
445*a97c2a1fSXin Li    vld1.16         d2, [r0]!
446*a97c2a1fSXin Li    vld1.16         d3, [r9]!
447*a97c2a1fSXin Li    vld1.16         d4, [r0], r5
448*a97c2a1fSXin Li    vmull.s16       q10, d2, d0[0]      @// y0 * cos4(part of c0 and c1)
449*a97c2a1fSXin Li    vld1.16         d5, [r9], r5
450*a97c2a1fSXin Li    vmull.s16       q9, d3, d1[2]       @// y2 * sin2 (Q3 is freed by this time)(part of d1)
451*a97c2a1fSXin Li    vld1.16         d6, [r0]!
452*a97c2a1fSXin Li    vld1.16         d7, [r9]!
453*a97c2a1fSXin Li    vmull.s16       q12, d6, d0[1]      @// y1 * cos1(part of b0)
454*a97c2a1fSXin Li    vld1.16         d8, [r0], r10
455*a97c2a1fSXin Li    vmull.s16       q13, d6, d0[3]      @// y1 * cos3(part of b1)
456*a97c2a1fSXin Li    vld1.16         d9, [r9], r10
457*a97c2a1fSXin Li    vmull.s16       q14, d6, d1[1]      @// y1 * sin3(part of b2)
458*a97c2a1fSXin Li    vld1.16         d10, [r0]!
459*a97c2a1fSXin Li    vmull.s16       q15, d6, d1[3]      @// y1 * sin1(part of b3)
460*a97c2a1fSXin Li    vld1.16         d11, [r9]!
461*a97c2a1fSXin Li    vmlal.s16       q12, d7, d0[3]      @// y1 * cos1 + y3 * cos3(part of b0)
462*a97c2a1fSXin Li    vld1.16         d12, [r0], r5
463*a97c2a1fSXin Li    vmlsl.s16       q13, d7, d1[3]      @// y1 * cos3 - y3 * sin1(part of b1)
464*a97c2a1fSXin Li    vld1.16         d13, [r9], r5
465*a97c2a1fSXin Li    vmlsl.s16       q14, d7, d0[1]      @// y1 * sin3 - y3 * cos1(part of b2)
466*a97c2a1fSXin Li    vld1.16         d14, [r0]!
467*a97c2a1fSXin Li    vmlsl.s16       q15, d7, d1[1]      @// y1 * sin1 - y3 * sin3(part of b3)
468*a97c2a1fSXin Li    vld1.16         d15, [r9]!
469*a97c2a1fSXin Li    vmull.s16       q11, d10, d0[0]     @// y4 * cos4(part of c0 and c1)
470*a97c2a1fSXin Li    vld1.16         d16, [r0], r10
471*a97c2a1fSXin Li    vmull.s16       q3, d3, d0[2]       @// y2 * cos2(part of d0)
472*a97c2a1fSXin Li    vld1.16         d17, [r9], r10
473*a97c2a1fSXin Li
474*a97c2a1fSXin Li    @/* This following was activated when alignment is not there */
475*a97c2a1fSXin Li@// VLD1.16     D2,[r0]!
476*a97c2a1fSXin Li@// VLD1.16     D3,[r2]!
477*a97c2a1fSXin Li@// VLD1.16     D4,[r0]!
478*a97c2a1fSXin Li@// VLD1.16     D5,[r2]!
479*a97c2a1fSXin Li@// VLD1.16     D6,[r0]!
480*a97c2a1fSXin Li@// VLD1.16     D7,[r2]!
481*a97c2a1fSXin Li@// VLD1.16     D8,[r0],r3
482*a97c2a1fSXin Li@// VLD1.16     D9,[r2],r3
483*a97c2a1fSXin Li@// VLD1.16     D10,[r0]!
484*a97c2a1fSXin Li@// VLD1.16     D11,[r2]!
485*a97c2a1fSXin Li@// VLD1.16     D12,[r0]!
486*a97c2a1fSXin Li@// VLD1.16     D13,[r2]!
487*a97c2a1fSXin Li@// VLD1.16     D14,[r0]!
488*a97c2a1fSXin Li@// VLD1.16     D15,[r2]!
489*a97c2a1fSXin Li@// VLD1.16     D16,[r0],r3
490*a97c2a1fSXin Li@// VLD1.16     D17,[r2],r3
491*a97c2a1fSXin Li
492*a97c2a1fSXin Li
493*a97c2a1fSXin Li
494*a97c2a1fSXin Li
495*a97c2a1fSXin Li    vmlal.s16       q12, d14, d1[1]     @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
496*a97c2a1fSXin Li    vmlsl.s16       q13, d14, d0[1]     @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
497*a97c2a1fSXin Li    vmlal.s16       q14, d14, d1[3]     @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
498*a97c2a1fSXin Li    vmlal.s16       q15, d14, d0[3]     @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
499*a97c2a1fSXin Li
500*a97c2a1fSXin Li    vmlsl.s16       q9, d11, d0[2]      @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
501*a97c2a1fSXin Li    vmlal.s16       q3, d11, d1[2]      @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
502*a97c2a1fSXin Li
503*a97c2a1fSXin Li    vadd.s32        q5, q10, q11        @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
504*a97c2a1fSXin Li    vsub.s32        q10, q10, q11       @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
505*a97c2a1fSXin Li
506*a97c2a1fSXin Li    vmlal.s16       q12, d15, d1[3]     @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
507*a97c2a1fSXin Li    vmlsl.s16       q13, d15, d1[1]     @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
508*a97c2a1fSXin Li    vmlal.s16       q14, d15, d0[3]     @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
509*a97c2a1fSXin Li    vmlsl.s16       q15, d15, d0[1]     @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
510*a97c2a1fSXin Li
511*a97c2a1fSXin Li    vadd.s32        q7, q5, q3          @// a0 = c0 + d0(part of r0,r7)
512*a97c2a1fSXin Li    vsub.s32        q5, q5, q3          @// a3 = c0 - d0(part of r3,r4)
513*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
514*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
515*a97c2a1fSXin Li
516*a97c2a1fSXin Li    vadd.s32        q10, q7, q12        @// a0 + b0(part of r0)
517*a97c2a1fSXin Li    vsub.s32        q3, q7, q12         @// a0 - b0(part of r7)
518*a97c2a1fSXin Li
519*a97c2a1fSXin Li    vadd.s32        q12, q11, q14       @// a2 + b2(part of r2)
520*a97c2a1fSXin Li    vsub.s32        q11, q11, q14       @// a2 - b2(part of r5)
521*a97c2a1fSXin Li
522*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
523*a97c2a1fSXin Li    vsub.s32        q9, q9, q13         @// a1 - b1(part of r6)
524*a97c2a1fSXin Li
525*a97c2a1fSXin Li    vadd.s32        q13, q5, q15        @// a3 + b3(part of r3)
526*a97c2a1fSXin Li    vsub.s32        q15, q5, q15        @// a3 - b3(part of r4)
527*a97c2a1fSXin Li
528*a97c2a1fSXin Li    vqrshrn.s32     d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
529*a97c2a1fSXin Li    vqrshrn.s32     d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
530*a97c2a1fSXin Li    vqrshrn.s32     d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
531*a97c2a1fSXin Li    vqrshrn.s32     d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
532*a97c2a1fSXin Li    vqrshrn.s32     d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
533*a97c2a1fSXin Li    vqrshrn.s32     d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
534*a97c2a1fSXin Li    vqrshrn.s32     d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
535*a97c2a1fSXin Li    vqrshrn.s32     d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
536*a97c2a1fSXin Li
537*a97c2a1fSXin Li
538*a97c2a1fSXin Li    b               last4_cols
539*a97c2a1fSXin Li
540*a97c2a1fSXin Li
541*a97c2a1fSXin Li
542*a97c2a1fSXin Liskip_last4_rows:
543*a97c2a1fSXin Li
544*a97c2a1fSXin Li
545*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_first_col_q15_addr2
546*a97c2a1fSXin Lifcq15_lbl2:
547*a97c2a1fSXin Li    add             r14, r14, pc
548*a97c2a1fSXin Li    vld1.16         {d0, d1}, [r14]     @//D0,D1 are used for storing the constant data
549*a97c2a1fSXin Li
550*a97c2a1fSXin Li    vld1.16         d2, [r0]!
551*a97c2a1fSXin Li    vld1.16         d3, [r9]!
552*a97c2a1fSXin Li    vld1.16         d4, [r0], r5
553*a97c2a1fSXin Li    vld1.16         d5, [r9], r5
554*a97c2a1fSXin Li    vld1.16         d6, [r0]!
555*a97c2a1fSXin Li    vld1.16         d7, [r9]!
556*a97c2a1fSXin Li    vld1.16         d8, [r0], r10
557*a97c2a1fSXin Li    vld1.16         d9, [r9], r10
558*a97c2a1fSXin Li
559*a97c2a1fSXin Li
560*a97c2a1fSXin Li
561*a97c2a1fSXin Li    vmov.s16        q6, #0
562*a97c2a1fSXin Li    vmov.s16        q8, #0
563*a97c2a1fSXin Li
564*a97c2a1fSXin Li
565*a97c2a1fSXin Li
566*a97c2a1fSXin Li
567*a97c2a1fSXin Li    vmull.s16       q12, d6, d0[1]      @// y1 * cos1(part of b0)
568*a97c2a1fSXin Li    vmull.s16       q13, d6, d0[3]      @// y1 * cos3(part of b1)
569*a97c2a1fSXin Li    vmull.s16       q14, d6, d1[1]      @// y1 * sin3(part of b2)
570*a97c2a1fSXin Li    vmull.s16       q15, d6, d1[3]      @// y1 * sin1(part of b3)
571*a97c2a1fSXin Li
572*a97c2a1fSXin Li    vmlal.s16       q12, d7, d0[3]      @// y1 * cos1 + y3 * cos3(part of b0)
573*a97c2a1fSXin Li    vmlsl.s16       q13, d7, d1[3]      @// y1 * cos3 - y3 * sin1(part of b1)
574*a97c2a1fSXin Li    vmlsl.s16       q14, d7, d0[1]      @// y1 * sin3 - y3 * cos1(part of b2)
575*a97c2a1fSXin Li    vmlsl.s16       q15, d7, d1[1]      @// y1 * sin1 - y3 * sin3(part of b3)
576*a97c2a1fSXin Li
577*a97c2a1fSXin Li    vmull.s16       q9, d3, d1[2]       @// y2 * sin2 (Q3 is freed by this time)(part of d1)
578*a97c2a1fSXin Li    vmull.s16       q3, d3, d0[2]       @// y2 * cos2(part of d0)
579*a97c2a1fSXin Li
580*a97c2a1fSXin Li    vmull.s16       q10, d2, d0[0]      @// y0 * cos4(part of c0 and c1)
581*a97c2a1fSXin Li
582*a97c2a1fSXin Li
583*a97c2a1fSXin Li    vadd.s32        q7, q10, q3         @// a0 = c0 + d0(part of r0,r7)
584*a97c2a1fSXin Li    vsub.s32        q5, q10, q3         @// a3 = c0 - d0(part of r3,r4)
585*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
586*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
587*a97c2a1fSXin Li
588*a97c2a1fSXin Li    vadd.s32        q10, q7, q12        @// a0 + b0(part of r0)
589*a97c2a1fSXin Li    vsub.s32        q3, q7, q12         @// a0 - b0(part of r7)
590*a97c2a1fSXin Li
591*a97c2a1fSXin Li    vadd.s32        q12, q11, q14       @// a2 + b2(part of r2)
592*a97c2a1fSXin Li    vsub.s32        q11, q11, q14       @// a2 - b2(part of r5)
593*a97c2a1fSXin Li
594*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
595*a97c2a1fSXin Li    vsub.s32        q9, q9, q13         @// a1 - b1(part of r6)
596*a97c2a1fSXin Li
597*a97c2a1fSXin Li    vadd.s32        q13, q5, q15        @// a3 + b3(part of r3)
598*a97c2a1fSXin Li    vsub.s32        q15, q5, q15        @// a3 - b3(part of r4)
599*a97c2a1fSXin Li
600*a97c2a1fSXin Li    vqrshrn.s32     d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
601*a97c2a1fSXin Li    vqrshrn.s32     d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
602*a97c2a1fSXin Li    vqrshrn.s32     d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
603*a97c2a1fSXin Li    vqrshrn.s32     d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
604*a97c2a1fSXin Li    vqrshrn.s32     d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
605*a97c2a1fSXin Li    vqrshrn.s32     d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
606*a97c2a1fSXin Li    vqrshrn.s32     d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
607*a97c2a1fSXin Li    vqrshrn.s32     d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
608*a97c2a1fSXin Li
609*a97c2a1fSXin Li
610*a97c2a1fSXin Lilast4_cols:
@// Stage-1 (column) IDCT of the second group of four columns.
@// Inputs (4 x s16 each): y0=d4 y1=d8 y2=d5 y3=d9 y4=d12 y5=d16 y6=d13 y7=d17.
@// The narrowed results overwrite the same registers:
@//                        r0=d4 r1=d8 r2=d5 r3=d9 r4=d12 r5=d16 r6=d13 r7=d17.
@// q9-q15 carry the 32-bit intermediates; q4, q6 and q8 are recycled as 32-bit
@// scratch once the 16-bit inputs they held have been consumed.
611*a97c2a1fSXin Li
612*a97c2a1fSXin Li
613*a97c2a1fSXin Li    cmp             r12, #0xf0          @// r12 is set earlier in the routine (not visible here); NOTE(review): presumably a zero-column mask - confirm
614*a97c2a1fSXin Li    bge             skip_last4cols      @// signed r12 >= 0xf0: bypass this stage and use the reduced stage 2 at skip_last4cols
615*a97c2a1fSXin Li
@// r14 = literal + pc. NOTE(review): the literal presumably holds the coefficient
@// table's offset relative to the add below - confirm against the literal pool.
616*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_first_col_q15_addr3
617*a97c2a1fSXin Lifcq15_lbl3:
618*a97c2a1fSXin Li    add             r14, r14, pc
619*a97c2a1fSXin Li    vld1.16         {d0, d1}, [r14]     @//D0,D1 are used for storing the constant data
@// Lane usage, going by the comments below:
@// d0[0]=cos4 d0[1]=cos1 d0[2]=cos2 d0[3]=cos3 d1[1]=sin3 d1[2]=sin2 d1[3]=sin1
620*a97c2a1fSXin Li
621*a97c2a1fSXin Li    vmull.s16       q12, d8, d0[1]      @// y1 * cos1(part of b0)
622*a97c2a1fSXin Li    vmull.s16       q13, d8, d0[3]      @// y1 * cos3(part of b1)
623*a97c2a1fSXin Li    vmull.s16       q14, d8, d1[1]      @// y1 * sin3(part of b2)
624*a97c2a1fSXin Li    vmull.s16       q15, d8, d1[3]      @// y1 * sin1(part of b3)
625*a97c2a1fSXin Li
626*a97c2a1fSXin Li    vmlal.s16       q12, d9, d0[3]      @// y1 * cos1 + y3 * cos3(part of b0)
627*a97c2a1fSXin Li    vmlsl.s16       q13, d9, d1[3]      @// y1 * cos3 - y3 * sin1(part of b1)
628*a97c2a1fSXin Li    vmlsl.s16       q14, d9, d0[1]      @// y1 * sin3 - y3 * cos1(part of b2)
629*a97c2a1fSXin Li    vmlsl.s16       q15, d9, d1[1]      @// y1 * sin1 - y3 * sin3(part of b3)
630*a97c2a1fSXin Li
631*a97c2a1fSXin Li    vmull.s16       q9, d5, d1[2]       @// y2 * sin2 (Q4 is freed by this time)(part of d1)
632*a97c2a1fSXin Li    vmull.s16       q4, d5, d0[2]       @// y2 * cos2(part of d0)
633*a97c2a1fSXin Li
634*a97c2a1fSXin Li    vmull.s16       q10, d4, d0[0]      @// y0 * cos4(part of c0 and c1)
635*a97c2a1fSXin Li    vmull.s16       q11, d12, d0[0]     @// y4 * cos4(part of c0 and c1)
636*a97c2a1fSXin Li
637*a97c2a1fSXin Li    vmlal.s16       q12, d16, d1[1]     @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
638*a97c2a1fSXin Li    vmlsl.s16       q13, d16, d0[1]     @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
639*a97c2a1fSXin Li    vmlal.s16       q14, d16, d1[3]     @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
640*a97c2a1fSXin Li    vmlal.s16       q15, d16, d0[3]     @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
641*a97c2a1fSXin Li
642*a97c2a1fSXin Li    vmlsl.s16       q9, d13, d0[2]      @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
643*a97c2a1fSXin Li    vmlal.s16       q4, d13, d1[2]      @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
644*a97c2a1fSXin Li
@// q6 (d12/d13 = y4/y6) is overwritten here; both inputs were consumed above.
645*a97c2a1fSXin Li    vadd.s32        q6, q10, q11        @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
646*a97c2a1fSXin Li    vsub.s32        q10, q10, q11       @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
647*a97c2a1fSXin Li
648*a97c2a1fSXin Li    vmlal.s16       q12, d17, d1[3]     @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
649*a97c2a1fSXin Li    vmlsl.s16       q13, d17, d1[1]     @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
650*a97c2a1fSXin Li    vmlal.s16       q14, d17, d0[3]     @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
651*a97c2a1fSXin Li    vmlsl.s16       q15, d17, d0[1]     @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
652*a97c2a1fSXin Li
@// q8 (d16/d17 = y5/y7) is overwritten here; both inputs were consumed above.
653*a97c2a1fSXin Li    vadd.s32        q8, q6, q4          @// a0 = c0 + d0(part of e0,e7)
654*a97c2a1fSXin Li    vsub.s32        q6, q6, q4          @// a3 = c0 - d0(part of e3,e4)
655*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of e2,e5)
656*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of e1,e6)
657*a97c2a1fSXin Li
658*a97c2a1fSXin Li    vadd.s32        q10, q8, q12        @// a0 + b0(part of e0)
659*a97c2a1fSXin Li    vsub.s32        q4, q8, q12         @// a0 - b0(part of e7)
660*a97c2a1fSXin Li
661*a97c2a1fSXin Li    vadd.s32        q12, q11, q14       @// a2 + b2(part of e2)
662*a97c2a1fSXin Li    vsub.s32        q11, q11, q14       @// a2 - b2(part of e5)
663*a97c2a1fSXin Li
664*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of e1)
665*a97c2a1fSXin Li    vsub.s32        q9, q9, q13         @// a1 - b1(part of e6)
666*a97c2a1fSXin Li
667*a97c2a1fSXin Li    vadd.s32        q13, q6, q15        @// a3 + b3(part of e3)
668*a97c2a1fSXin Li    vsub.s32        q15, q6, q15        @// a3 - b3(part of e4)
669*a97c2a1fSXin Li
@// Rounding, saturating narrow of each 32-bit sum back to s16, written over the
@// registers that held the inputs.
670*a97c2a1fSXin Li    vqrshrn.s32     d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
671*a97c2a1fSXin Li    vqrshrn.s32     d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
672*a97c2a1fSXin Li    vqrshrn.s32     d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
673*a97c2a1fSXin Li    vqrshrn.s32     d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
674*a97c2a1fSXin Li    vqrshrn.s32     d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
675*a97c2a1fSXin Li    vqrshrn.s32     d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
676*a97c2a1fSXin Li    vqrshrn.s32     d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
677*a97c2a1fSXin Li    vqrshrn.s32     d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
678*a97c2a1fSXin Li    b               end_skip_last4cols  @// all eight columns done: continue with the full stage 2
679*a97c2a1fSXin Li
680*a97c2a1fSXin Li
681*a97c2a1fSXin Li
682*a97c2a1fSXin Liskip_last4cols:
@// Stage-2 (row) IDCT, reduced form, entered when stage 1 of the last four
@// columns was bypassed. Only the y0..y3 inputs of every row are used; all the
@// y4..y7 multiply-accumulates of the full version (end_skip_last4cols) are left out.
@// NOTE(review): presumably valid because the bypassed columns are all zero -
@// confirm against the code that sets r12.
@// Rows 0-3 read y0=d2 y1=d6 y2=d3 y3=d7 and write elements 0..7 to d2..d9.
@// Rows 4-7 read y0=d10 y1=d14 y2=d11 y3=d15 and write elements 0..7 to d10..d17.
@// d2..d9 are transposed back here; d10..d17 are transposed at pred_buff_addition.
683*a97c2a1fSXin Li
684*a97c2a1fSXin Li
685*a97c2a1fSXin Li
@// r14 = literal + pc. NOTE(review): the literal presumably holds the coefficient
@// table's offset relative to the add below - confirm against the literal pool.
686*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_first_col_q11_addr1
687*a97c2a1fSXin Lifcq11_lbl1:
688*a97c2a1fSXin Li    add             r14, r14, pc
689*a97c2a1fSXin Li    vld1.16         {d0, d1}, [r14]     @//D0,D1 are used for storing the constant data
690*a97c2a1fSXin Li
691*a97c2a1fSXin Li
692*a97c2a1fSXin Li
@// Only the first and third quadrants are transposed in this path; the q2/q4 and
@// q6/q8 quadrants that the full path also transposes are not touched.
693*a97c2a1fSXin Li    vtrn.16         q1, q3              @//[r3,r1],[r2,r0] first quadrant transposing
694*a97c2a1fSXin Li
695*a97c2a1fSXin Li    vtrn.16         q5, q7              @//[r7,r5],[r6,r4] third quadrant transposing
696*a97c2a1fSXin Li
697*a97c2a1fSXin Li
698*a97c2a1fSXin Li    vtrn.32         d6, d7              @//r0,r1,r2,r3 first quadrant transposing continued.....
699*a97c2a1fSXin Li    vtrn.32         d2, d3              @//r0,r1,r2,r3 first quadrant transposing continued.....
700*a97c2a1fSXin Li
701*a97c2a1fSXin Li    vtrn.32         d10, d11            @//r4,r5,r6,r7 third quadrant transposing continued.....
702*a97c2a1fSXin Li    vtrn.32         d14, d15            @//r4,r5,r6,r7 third quadrant transposing continued.....
703*a97c2a1fSXin Li
704*a97c2a1fSXin Li
@// ---- rows 0-3 ----
705*a97c2a1fSXin Li    vmull.s16       q12, d6, d0[1]      @// y1 * cos1(part of b0)
706*a97c2a1fSXin Li    vmull.s16       q13, d6, d0[3]      @// y1 * cos3(part of b1)
707*a97c2a1fSXin Li    vmull.s16       q14, d6, d1[1]      @// y1 * sin3(part of b2)
708*a97c2a1fSXin Li    vmull.s16       q15, d6, d1[3]      @// y1 * sin1(part of b3)
709*a97c2a1fSXin Li
710*a97c2a1fSXin Li    vmlal.s16       q12, d7, d0[3]      @// y1 * cos1 + y3 * cos3(part of b0)
711*a97c2a1fSXin Li    vmlsl.s16       q13, d7, d1[3]      @// y1 * cos3 - y3 * sin1(part of b1)
712*a97c2a1fSXin Li    vmlsl.s16       q14, d7, d0[1]      @// y1 * sin3 - y3 * cos1(part of b2)
713*a97c2a1fSXin Li    vmlsl.s16       q15, d7, d1[1]      @// y1 * sin1 - y3 * sin3(part of b3)
714*a97c2a1fSXin Li
@// With no y4 term, q10 alone serves as both c0 and c1 below.
715*a97c2a1fSXin Li    vmull.s16       q10, d2, d0[0]      @// y0 * cos4(part of c0 and c1)
716*a97c2a1fSXin Li@   VMULL.S16   Q11,D4,D0[0]                    ;// y4 * cos4(part of c0 and c1)
717*a97c2a1fSXin Li
718*a97c2a1fSXin Li    vmull.s16       q9, d3, d1[2]       @// y2 * sin2 (Q3 is freed by this time)(part of d1)
719*a97c2a1fSXin Li    vmull.s16       q3, d3, d0[2]       @// y2 * cos2(part of d0)
720*a97c2a1fSXin Li
721*a97c2a1fSXin Li
722*a97c2a1fSXin Li
723*a97c2a1fSXin Li
724*a97c2a1fSXin Li    vsub.s32        q11, q10, q3        @// a3 = c0 - d0(part of r3,r4)
725*a97c2a1fSXin Li    vadd.s32        q2, q10, q3         @// a0 = c0 + d0(part of r0,r7)
726*a97c2a1fSXin Li
727*a97c2a1fSXin Li
728*a97c2a1fSXin Li    vadd.s32        q1, q2, q12         @// a0 + b0(part of r0)
729*a97c2a1fSXin Li
730*a97c2a1fSXin Li    vsub.s32        q3, q2, q12         @// a0 - b0(part of r7)
731*a97c2a1fSXin Li
732*a97c2a1fSXin Li    vadd.s32        q4, q11, q15        @// a3 + b3(part of r3)
733*a97c2a1fSXin Li
734*a97c2a1fSXin Li    vsub.s32        q12, q11, q15       @// a3 - b3(part of r4)
735*a97c2a1fSXin Li
736*a97c2a1fSXin Li    vqrshrn.s32     d5, q4, #idct_stg2_shift @// r3 = (a3 + b3 + rnd) >> idct_stg2_shift
737*a97c2a1fSXin Li    vqrshrn.s32     d2, q1, #idct_stg2_shift @// r0 = (a0 + b0 + rnd) >> idct_stg2_shift
738*a97c2a1fSXin Li    vqrshrn.s32     d9, q3, #idct_stg2_shift @// r7 = (a0 - b0 + rnd) >> idct_stg2_shift
739*a97c2a1fSXin Li    vqrshrn.s32     d6, q12, #idct_stg2_shift @// r4 = (a3 - b3 + rnd) >> idct_stg2_shift
740*a97c2a1fSXin Li
741*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
742*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
743*a97c2a1fSXin Li
744*a97c2a1fSXin Li
745*a97c2a1fSXin Li    vadd.s32        q15, q11, q14       @// a2 + b2(part of r2)
746*a97c2a1fSXin Li
747*a97c2a1fSXin Li    vsub.s32        q12, q11, q14       @// a2 - b2(part of r5)
748*a97c2a1fSXin Li
749*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
750*a97c2a1fSXin Li
751*a97c2a1fSXin Li    vsub.s32        q11, q9, q13        @// a1 - b1(part of r6)
752*a97c2a1fSXin Li    vqrshrn.s32     d4, q15, #idct_stg2_shift @// r2 = (a2 + b2 + rnd) >> idct_stg2_shift
753*a97c2a1fSXin Li    vqrshrn.s32     d7, q12, #idct_stg2_shift @// r5 = (a2 - b2 + rnd) >> idct_stg2_shift
754*a97c2a1fSXin Li    vqrshrn.s32     d3, q14, #idct_stg2_shift @// r1 = (a1 + b1 + rnd) >> idct_stg2_shift
755*a97c2a1fSXin Li    vqrshrn.s32     d8, q11, #idct_stg2_shift @// r6 = (a1 - b1 + rnd) >> idct_stg2_shift
756*a97c2a1fSXin Li
757*a97c2a1fSXin Li
758*a97c2a1fSXin Li
759*a97c2a1fSXin Li
760*a97c2a1fSXin Li
761*a97c2a1fSXin Li
762*a97c2a1fSXin Li
763*a97c2a1fSXin Li
764*a97c2a1fSXin Li
765*a97c2a1fSXin Li
@// ---- rows 4-7 ----
@// The vtrn/vswp instructions mixed in below transpose the rows 0-3 results
@// (d2..d9) back so that q1..q4 each hold one full output row.
766*a97c2a1fSXin Li    vmull.s16       q12, d14, d0[1]     @// y1 * cos1(part of b0)
767*a97c2a1fSXin Li
768*a97c2a1fSXin Li    vmull.s16       q13, d14, d0[3]     @// y1 * cos3(part of b1)
769*a97c2a1fSXin Li    vmull.s16       q14, d14, d1[1]     @// y1 * sin3(part of b2)
770*a97c2a1fSXin Li    vmull.s16       q15, d14, d1[3]     @// y1 * sin1(part of b3)
771*a97c2a1fSXin Li
772*a97c2a1fSXin Li    vmlal.s16       q12, d15, d0[3]     @// y1 * cos1 + y3 * cos3(part of b0)
773*a97c2a1fSXin Li    vtrn.16         d2, d3
774*a97c2a1fSXin Li    vmlsl.s16       q13, d15, d1[3]     @// y1 * cos3 - y3 * sin1(part of b1)
775*a97c2a1fSXin Li    vtrn.16         d4, d5
776*a97c2a1fSXin Li    vmlsl.s16       q14, d15, d0[1]     @// y1 * sin3 - y3 * cos1(part of b2)
777*a97c2a1fSXin Li    vtrn.16         d6, d7
778*a97c2a1fSXin Li    vmlsl.s16       q15, d15, d1[1]     @// y1 * sin1 - y3 * sin3(part of b3)
779*a97c2a1fSXin Li    vtrn.16         d8, d9
780*a97c2a1fSXin Li    vmull.s16       q10, d10, d0[0]     @// y0 * cos4(part of c0 and c1)
781*a97c2a1fSXin Li    vtrn.32         d2, d4
782*a97c2a1fSXin Li
783*a97c2a1fSXin Li    vtrn.32         d3, d5
784*a97c2a1fSXin Li    vmull.s16       q9, d11, d1[2]      @// y2 * sin2 (Q7 is freed by this time)(part of d1)
785*a97c2a1fSXin Li    vtrn.32         d6, d8
786*a97c2a1fSXin Li    vmull.s16       q7, d11, d0[2]      @// y2 * cos2(part of d0)
787*a97c2a1fSXin Li    vtrn.32         d7, d9
788*a97c2a1fSXin Li
789*a97c2a1fSXin Li
790*a97c2a1fSXin Li    add             r4, r2, r8, lsl #1  @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
791*a97c2a1fSXin Li
792*a97c2a1fSXin Li
793*a97c2a1fSXin Li    add             r5, r8, r8, lsl #1  @ r5 = r8 * 3 (three pred strides, used as a post-increment below)
794*a97c2a1fSXin Li
795*a97c2a1fSXin Li
796*a97c2a1fSXin Li    add             r0, r3, r7, lsl #1  @ r0 points to 3rd row of dest data
797*a97c2a1fSXin Li
798*a97c2a1fSXin Li
799*a97c2a1fSXin Li    add             r10, r7, r7, lsl #1 @ r10 = r7 * 3 (three dest strides, used by the stores at pred_buff_addition)
800*a97c2a1fSXin Li
801*a97c2a1fSXin Li
802*a97c2a1fSXin Li    vswp            d3, d6
803*a97c2a1fSXin Li
804*a97c2a1fSXin Li
805*a97c2a1fSXin Li    vswp            d5, d8
806*a97c2a1fSXin Li
807*a97c2a1fSXin Li
808*a97c2a1fSXin Li    vsub.s32        q11, q10, q7        @// a3 = c0 - d0(part of r3,r4)
809*a97c2a1fSXin Li    vadd.s32        q6, q10, q7         @// a0 = c0 + d0(part of r0,r7)
810*a97c2a1fSXin Li
811*a97c2a1fSXin Li
@// q0 held the coefficients (d0/d1); their last use is above, so it is scratch from here on.
812*a97c2a1fSXin Li    vadd.s32        q0, q6, q12         @// a0 + b0(part of r0)
813*a97c2a1fSXin Li
814*a97c2a1fSXin Li
815*a97c2a1fSXin Li    vsub.s32        q12, q6, q12        @// a0 - b0(part of r7)
816*a97c2a1fSXin Li
817*a97c2a1fSXin Li
818*a97c2a1fSXin Li    vadd.s32        q6, q11, q15        @// a3 + b3(part of r3)
819*a97c2a1fSXin Li
820*a97c2a1fSXin Li
821*a97c2a1fSXin Li    vsub.s32        q7, q11, q15        @// a3 - b3(part of r4)
822*a97c2a1fSXin Li
823*a97c2a1fSXin Li    vqrshrn.s32     d10, q0, #idct_stg2_shift @// r0 = (a0 + b0 + rnd) >> idct_stg2_shift
824*a97c2a1fSXin Li    vqrshrn.s32     d17, q12, #idct_stg2_shift @// r7 = (a0 - b0 + rnd) >> idct_stg2_shift
825*a97c2a1fSXin Li    vqrshrn.s32     d13, q6, #idct_stg2_shift @// r3 = (a3 + b3 + rnd) >> idct_stg2_shift
826*a97c2a1fSXin Li    vqrshrn.s32     d14, q7, #idct_stg2_shift @// r4 = (a3 - b3 + rnd) >> idct_stg2_shift
827*a97c2a1fSXin Li
828*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
829*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
830*a97c2a1fSXin Li
831*a97c2a1fSXin Li
832*a97c2a1fSXin Li    vadd.s32        q0, q11, q14        @// a2 + b2(part of r2)
833*a97c2a1fSXin Li
834*a97c2a1fSXin Li
835*a97c2a1fSXin Li    vsub.s32        q12, q11, q14       @// a2 - b2(part of r5)
836*a97c2a1fSXin Li
837*a97c2a1fSXin Li
838*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
839*a97c2a1fSXin Li
840*a97c2a1fSXin Li
841*a97c2a1fSXin Li    vsub.s32        q13, q9, q13        @// a1 - b1(part of r6)
@// Prediction loads (8 bytes per row) are interleaved with the last narrows.
@// Row numbers count from the row r2 pointed at on entry to this section.
842*a97c2a1fSXin Li    vld1.8          d18, [r2], r8       @// pred row 0; r2 += r8
843*a97c2a1fSXin Li
844*a97c2a1fSXin Li    vqrshrn.s32     d12, q0, #idct_stg2_shift @// r2 = (a2 + b2 + rnd) >> idct_stg2_shift
845*a97c2a1fSXin Li    vld1.8          d20, [r2], r5       @// pred row 1; r2 += 3 * r8 (now at row 4)
846*a97c2a1fSXin Li
847*a97c2a1fSXin Li
848*a97c2a1fSXin Li    vqrshrn.s32     d15, q12, #idct_stg2_shift @// r5 = (a2 - b2 + rnd) >> idct_stg2_shift
849*a97c2a1fSXin Li    vld1.8          d19, [r2], r8       @// pred row 4; r2 += r8
850*a97c2a1fSXin Li
851*a97c2a1fSXin Li
852*a97c2a1fSXin Li
853*a97c2a1fSXin Li
854*a97c2a1fSXin Li    vqrshrn.s32     d11, q14, #idct_stg2_shift @// r1 = (a1 + b1 + rnd) >> idct_stg2_shift
855*a97c2a1fSXin Li    vld1.8          d22, [r4], r8       @// pred row 2; r4 += r8 (now at row 3)
856*a97c2a1fSXin Li
857*a97c2a1fSXin Li
858*a97c2a1fSXin Li
859*a97c2a1fSXin Li
860*a97c2a1fSXin Li    vqrshrn.s32     d16, q13, #idct_stg2_shift @// r6 = (a1 - b1 + rnd) >> idct_stg2_shift
861*a97c2a1fSXin Li    vld1.8          d21, [r2], r5       @// pred row 5
862*a97c2a1fSXin Li
863*a97c2a1fSXin Li
864*a97c2a1fSXin Li    b               pred_buff_addition  @// jump over the full stage 2 to the shared tail
865*a97c2a1fSXin Liend_skip_last4cols:
@// Stage-2 (row) IDCT, full form: all eight inputs y0..y7 of every row are used.
@// Rows 0-3 write elements 0..7 to d2..d9; rows 4-7 write elements 0..7 to d10..d17.
@// d2..d9 are transposed back here; d10..d17 are transposed at pred_buff_addition,
@// which this section falls through to.
866*a97c2a1fSXin Li
@// r14 = literal + pc. NOTE(review): the literal presumably holds the coefficient
@// table's offset relative to the add below - confirm against the literal pool.
867*a97c2a1fSXin Li    ldr             r14, gai2_impeg2_idct_first_col_q11_addr2
868*a97c2a1fSXin Lifcq11_lbl2:
869*a97c2a1fSXin Li    add             r14, r14, pc
870*a97c2a1fSXin Li    vld1.16         {d0, d1}, [r14]     @//D0,D1 are used for storing the constant data
871*a97c2a1fSXin Li
872*a97c2a1fSXin Li
873*a97c2a1fSXin Li@/* Now the Idct of columns is done, transpose so that row idct is done efficiently(step5) */
874*a97c2a1fSXin Li    vtrn.16         q1, q3              @//[r3,r1],[r2,r0] first quadrant transposing
875*a97c2a1fSXin Li    vtrn.16         q2, q4              @//[r3,r1],[r2,r0] second quadrant transposing
876*a97c2a1fSXin Li    vtrn.16         q5, q7              @//[r7,r5],[r6,r4] third quadrant transposing
877*a97c2a1fSXin Li    vtrn.16         q6, q8              @//[r7,r5],[r6,r4] fourth quadrant transposing
878*a97c2a1fSXin Li
879*a97c2a1fSXin Li    vtrn.32         d6, d7              @//r0,r1,r2,r3 first quadrant transposing continued.....
880*a97c2a1fSXin Li    vtrn.32         d2, d3              @//r0,r1,r2,r3 first quadrant transposing continued.....
881*a97c2a1fSXin Li    vtrn.32         d4, d5              @//r0,r1,r2,r3 second quadrant transposing continued.....
882*a97c2a1fSXin Li    vtrn.32         d8, d9              @//r0,r1,r2,r3 second quadrant transposing continued.....
883*a97c2a1fSXin Li    vtrn.32         d10, d11            @//r4,r5,r6,r7 third quadrant transposing continued.....
884*a97c2a1fSXin Li    vtrn.32         d14, d15            @//r4,r5,r6,r7 third quadrant transposing continued.....
885*a97c2a1fSXin Li    vtrn.32         d12, d13            @//r4,r5,r6,r7 fourth quadrant transposing continued.....
886*a97c2a1fSXin Li    vtrn.32         d16, d17            @//r4,r5,r6,r7 fourth quadrant transposing continued.....
887*a97c2a1fSXin Li
888*a97c2a1fSXin Li    @//step6 Operate on first four rows and find their idct
889*a97c2a1fSXin Li    @//Register Usage Reference     - storing and IDCT of rows
890*a97c2a1fSXin Li@// Cosine Constants    -   D0
891*a97c2a1fSXin Li@// Sine Constants      -   D1
892*a97c2a1fSXin Li@// Element 0 First four    -   D2      -   y0
893*a97c2a1fSXin Li@// Element 1 First four    -   D6      -   y1
894*a97c2a1fSXin Li@// Element 2 First four    -   D3      -   y2
895*a97c2a1fSXin Li@// Element 3 First four    -   D7      -   y3
896*a97c2a1fSXin Li@// Element 4 First four    -   D4      -   y4
897*a97c2a1fSXin Li@// Element 5 First four    -   D8      -   y5
898*a97c2a1fSXin Li@// Element 6 First four    -   D5      -   y6
899*a97c2a1fSXin Li@// Element 7 First four    -   D9      -   y7
900*a97c2a1fSXin Li@// Element 0 Second four   -   D10     -   y0
901*a97c2a1fSXin Li@// Element 1 Second four   -   D14     -   y1
902*a97c2a1fSXin Li@// Element 2 Second four   -   D11     -   y2
903*a97c2a1fSXin Li@// Element 3 Second four   -   D15     -   y3
904*a97c2a1fSXin Li@// Element 4 Second four   -   D12     -   y4
905*a97c2a1fSXin Li@// Element 5 Second four   -   D16     -   y5
906*a97c2a1fSXin Li@// Element 6 Second four   -   D13     -   y6
907*a97c2a1fSXin Li@// Element 7 Second four   -   D17     -   y7
908*a97c2a1fSXin Li
909*a97c2a1fSXin Li    @// Map between first kernel code seq and current
910*a97c2a1fSXin Li@//     D2  ->  D2
911*a97c2a1fSXin Li@//     D6  ->  D6
912*a97c2a1fSXin Li@//     D3  ->  D3
913*a97c2a1fSXin Li@//     D7  ->  D7
914*a97c2a1fSXin Li@//     D10 ->  D4
915*a97c2a1fSXin Li@//     D14 ->  D8
916*a97c2a1fSXin Li@//     D11 ->  D5
917*a97c2a1fSXin Li@//     D15 ->  D9
918*a97c2a1fSXin Li@//     Q3  ->  Q3
919*a97c2a1fSXin Li@//     Q5  ->  Q2
920*a97c2a1fSXin Li@//     Q7  ->  Q4
921*a97c2a1fSXin Li
@// ---- rows 0-3 ----
922*a97c2a1fSXin Li    vmull.s16       q12, d6, d0[1]      @// y1 * cos1(part of b0)
923*a97c2a1fSXin Li    vmull.s16       q13, d6, d0[3]      @// y1 * cos3(part of b1)
924*a97c2a1fSXin Li    vmull.s16       q14, d6, d1[1]      @// y1 * sin3(part of b2)
925*a97c2a1fSXin Li    vmull.s16       q15, d6, d1[3]      @// y1 * sin1(part of b3)
926*a97c2a1fSXin Li
927*a97c2a1fSXin Li    vmlal.s16       q12, d7, d0[3]      @// y1 * cos1 + y3 * cos3(part of b0)
928*a97c2a1fSXin Li    vmlsl.s16       q13, d7, d1[3]      @// y1 * cos3 - y3 * sin1(part of b1)
929*a97c2a1fSXin Li    vmlsl.s16       q14, d7, d0[1]      @// y1 * sin3 - y3 * cos1(part of b2)
930*a97c2a1fSXin Li    vmlsl.s16       q15, d7, d1[1]      @// y1 * sin1 - y3 * sin3(part of b3)
931*a97c2a1fSXin Li
932*a97c2a1fSXin Li    vmull.s16       q10, d2, d0[0]      @// y0 * cos4(part of c0 and c1)
933*a97c2a1fSXin Li    vmull.s16       q11, d4, d0[0]      @// y4 * cos4(part of c0 and c1)
934*a97c2a1fSXin Li
935*a97c2a1fSXin Li    vmull.s16       q9, d3, d1[2]       @// y2 * sin2 (Q3 is freed by this time)(part of d1)
936*a97c2a1fSXin Li    vmull.s16       q3, d3, d0[2]       @// y2 * cos2(part of d0)
937*a97c2a1fSXin Li
938*a97c2a1fSXin Li
939*a97c2a1fSXin Li    vmlal.s16       q12, d8, d1[1]      @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
940*a97c2a1fSXin Li    vmlsl.s16       q13, d8, d0[1]      @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
941*a97c2a1fSXin Li    vmlal.s16       q14, d8, d1[3]      @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
942*a97c2a1fSXin Li    vmlal.s16       q15, d8, d0[3]      @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
943*a97c2a1fSXin Li
944*a97c2a1fSXin Li    vmlsl.s16       q9, d5, d0[2]       @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
945*a97c2a1fSXin Li    vmlal.s16       q3, d5, d1[2]       @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
946*a97c2a1fSXin Li
947*a97c2a1fSXin Li    vadd.s32        q1, q10, q11        @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
948*a97c2a1fSXin Li    vsub.s32        q10, q10, q11       @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
949*a97c2a1fSXin Li
950*a97c2a1fSXin Li    vmlal.s16       q12, d9, d1[3]      @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
951*a97c2a1fSXin Li    vmlsl.s16       q13, d9, d1[1]      @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
952*a97c2a1fSXin Li    vmlal.s16       q14, d9, d0[3]      @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
953*a97c2a1fSXin Li    vmlsl.s16       q15, d9, d0[1]      @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
954*a97c2a1fSXin Li
955*a97c2a1fSXin Li    vsub.s32        q11, q1, q3         @// a3 = c0 - d0(part of r3,r4)
956*a97c2a1fSXin Li    vadd.s32        q2, q1, q3          @// a0 = c0 + d0(part of r0,r7)
957*a97c2a1fSXin Li
958*a97c2a1fSXin Li
959*a97c2a1fSXin Li    vadd.s32        q1, q2, q12         @// a0 + b0(part of r0)
960*a97c2a1fSXin Li
961*a97c2a1fSXin Li    vsub.s32        q3, q2, q12         @// a0 - b0(part of r7)
962*a97c2a1fSXin Li
963*a97c2a1fSXin Li    vadd.s32        q4, q11, q15        @// a3 + b3(part of r3)
964*a97c2a1fSXin Li
965*a97c2a1fSXin Li    vsub.s32        q12, q11, q15       @// a3 - b3(part of r4)
966*a97c2a1fSXin Li
967*a97c2a1fSXin Li    vqrshrn.s32     d5, q4, #idct_stg2_shift @// r3 = (a3 + b3 + rnd) >> idct_stg2_shift
968*a97c2a1fSXin Li    vqrshrn.s32     d2, q1, #idct_stg2_shift @// r0 = (a0 + b0 + rnd) >> idct_stg2_shift
969*a97c2a1fSXin Li    vqrshrn.s32     d9, q3, #idct_stg2_shift @// r7 = (a0 - b0 + rnd) >> idct_stg2_shift
970*a97c2a1fSXin Li    vqrshrn.s32     d6, q12, #idct_stg2_shift @// r4 = (a3 - b3 + rnd) >> idct_stg2_shift
971*a97c2a1fSXin Li
972*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
973*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
974*a97c2a1fSXin Li
975*a97c2a1fSXin Li
976*a97c2a1fSXin Li    vadd.s32        q15, q11, q14       @// a2 + b2(part of r2)
977*a97c2a1fSXin Li
978*a97c2a1fSXin Li    vsub.s32        q12, q11, q14       @// a2 - b2(part of r5)
979*a97c2a1fSXin Li
980*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
981*a97c2a1fSXin Li
982*a97c2a1fSXin Li    vsub.s32        q11, q9, q13        @// a1 - b1(part of r6)
983*a97c2a1fSXin Li    vqrshrn.s32     d4, q15, #idct_stg2_shift @// r2 = (a2 + b2 + rnd) >> idct_stg2_shift
984*a97c2a1fSXin Li    vqrshrn.s32     d7, q12, #idct_stg2_shift @// r5 = (a2 - b2 + rnd) >> idct_stg2_shift
985*a97c2a1fSXin Li    vqrshrn.s32     d3, q14, #idct_stg2_shift @// r1 = (a1 + b1 + rnd) >> idct_stg2_shift
986*a97c2a1fSXin Li    vqrshrn.s32     d8, q11, #idct_stg2_shift @// r6 = (a1 - b1 + rnd) >> idct_stg2_shift
987*a97c2a1fSXin Li
988*a97c2a1fSXin Li
989*a97c2a1fSXin Li
990*a97c2a1fSXin Li
991*a97c2a1fSXin Li
992*a97c2a1fSXin Li
993*a97c2a1fSXin Li
994*a97c2a1fSXin Li
995*a97c2a1fSXin Li
996*a97c2a1fSXin Li
@// ---- rows 4-7 ----
@// The vtrn/vswp instructions mixed in below transpose the rows 0-3 results
@// (d2..d9) back so that q1..q4 each hold one full output row; the scalar adds
@// set up the prediction/destination pointers and strides.
997*a97c2a1fSXin Li    vmull.s16       q12, d14, d0[1]     @// y1 * cos1(part of b0)
998*a97c2a1fSXin Li
999*a97c2a1fSXin Li    vmull.s16       q13, d14, d0[3]     @// y1 * cos3(part of b1)
1000*a97c2a1fSXin Li    vmull.s16       q14, d14, d1[1]     @// y1 * sin3(part of b2)
1001*a97c2a1fSXin Li    vmull.s16       q15, d14, d1[3]     @// y1 * sin1(part of b3)
1002*a97c2a1fSXin Li
1003*a97c2a1fSXin Li    vmlal.s16       q12, d15, d0[3]     @// y1 * cos1 + y3 * cos3(part of b0)
1004*a97c2a1fSXin Li    vtrn.16         d2, d3
1005*a97c2a1fSXin Li    vmlsl.s16       q13, d15, d1[3]     @// y1 * cos3 - y3 * sin1(part of b1)
1006*a97c2a1fSXin Li    vtrn.16         d4, d5
1007*a97c2a1fSXin Li    vmlsl.s16       q14, d15, d0[1]     @// y1 * sin3 - y3 * cos1(part of b2)
1008*a97c2a1fSXin Li    vtrn.16         d6, d7
1009*a97c2a1fSXin Li    vmlsl.s16       q15, d15, d1[1]     @// y1 * sin1 - y3 * sin3(part of b3)
1010*a97c2a1fSXin Li    vtrn.16         d8, d9
1011*a97c2a1fSXin Li    vmull.s16       q10, d10, d0[0]     @// y0 * cos4(part of c0 and c1)
1012*a97c2a1fSXin Li    vtrn.32         d2, d4
1013*a97c2a1fSXin Li    vmull.s16       q11, d12, d0[0]     @// y4 * cos4(part of c0 and c1)
1014*a97c2a1fSXin Li    vtrn.32         d3, d5
1015*a97c2a1fSXin Li    vmull.s16       q9, d11, d1[2]      @// y2 * sin2 (Q7 is freed by this time)(part of d1)
1016*a97c2a1fSXin Li    vtrn.32         d6, d8
1017*a97c2a1fSXin Li    vmull.s16       q7, d11, d0[2]      @// y2 * cos2(part of d0)
1018*a97c2a1fSXin Li    vtrn.32         d7, d9
1019*a97c2a1fSXin Li    vmlal.s16       q12, d16, d1[1]     @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
1020*a97c2a1fSXin Li
1021*a97c2a1fSXin Li    add             r4, r2, r8, lsl #1  @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
1022*a97c2a1fSXin Li    vmlsl.s16       q13, d16, d0[1]     @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
1023*a97c2a1fSXin Li
1024*a97c2a1fSXin Li    add             r5, r8, r8, lsl #1  @ r5 = r8 * 3 (three pred strides, used as a post-increment below)
1025*a97c2a1fSXin Li    vmlal.s16       q14, d16, d1[3]     @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
1026*a97c2a1fSXin Li
1027*a97c2a1fSXin Li    add             r0, r3, r7, lsl #1  @ r0 points to 3rd row of dest data
1028*a97c2a1fSXin Li    vmlal.s16       q15, d16, d0[3]     @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
1029*a97c2a1fSXin Li
1030*a97c2a1fSXin Li    add             r10, r7, r7, lsl #1 @ r10 = r7 * 3 (three dest strides, used by the stores at pred_buff_addition)
1031*a97c2a1fSXin Li    vmlsl.s16       q9, d13, d0[2]      @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
1032*a97c2a1fSXin Li
1033*a97c2a1fSXin Li
1034*a97c2a1fSXin Li    vmlal.s16       q7, d13, d1[2]      @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
1035*a97c2a1fSXin Li
1036*a97c2a1fSXin Li    vadd.s32        q6, q10, q11        @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
1037*a97c2a1fSXin Li    vsub.s32        q10, q10, q11       @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
1038*a97c2a1fSXin Li
1039*a97c2a1fSXin Li    vmlal.s16       q12, d17, d1[3]     @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
1040*a97c2a1fSXin Li    vswp            d3, d6
1041*a97c2a1fSXin Li    vmlsl.s16       q13, d17, d1[1]     @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
1042*a97c2a1fSXin Li
1043*a97c2a1fSXin Li    vswp            d5, d8
1044*a97c2a1fSXin Li    vmlal.s16       q14, d17, d0[3]     @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
1045*a97c2a1fSXin Li    vmlsl.s16       q15, d17, d0[1]     @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
1046*a97c2a1fSXin Li
1047*a97c2a1fSXin Li    vsub.s32        q11, q6, q7         @// a3 = c0 - d0(part of r3,r4)
1048*a97c2a1fSXin Li    vadd.s32        q6, q6, q7          @// a0 = c0 + d0(part of r0,r7)
1049*a97c2a1fSXin Li
1050*a97c2a1fSXin Li
@// q0 held the coefficients (d0/d1); their last use is above, so it is scratch from here on.
1051*a97c2a1fSXin Li    vadd.s32        q0, q6, q12         @// a0 + b0(part of r0)
1052*a97c2a1fSXin Li
1053*a97c2a1fSXin Li
1054*a97c2a1fSXin Li    vsub.s32        q12, q6, q12        @// a0 - b0(part of r7)
1055*a97c2a1fSXin Li
1056*a97c2a1fSXin Li
1057*a97c2a1fSXin Li    vadd.s32        q6, q11, q15        @// a3 + b3(part of r3)
1058*a97c2a1fSXin Li
1059*a97c2a1fSXin Li
1060*a97c2a1fSXin Li    vsub.s32        q7, q11, q15        @// a3 - b3(part of r4)
1061*a97c2a1fSXin Li
1062*a97c2a1fSXin Li    vqrshrn.s32     d10, q0, #idct_stg2_shift @// r0 = (a0 + b0 + rnd) >> idct_stg2_shift
1063*a97c2a1fSXin Li    vqrshrn.s32     d17, q12, #idct_stg2_shift @// r7 = (a0 - b0 + rnd) >> idct_stg2_shift
1064*a97c2a1fSXin Li    vqrshrn.s32     d13, q6, #idct_stg2_shift @// r3 = (a3 + b3 + rnd) >> idct_stg2_shift
1065*a97c2a1fSXin Li    vqrshrn.s32     d14, q7, #idct_stg2_shift @// r4 = (a3 - b3 + rnd) >> idct_stg2_shift
1066*a97c2a1fSXin Li
1067*a97c2a1fSXin Li    vsub.s32        q11, q10, q9        @// a2 = c1 - d1(part of r2,r5)
1068*a97c2a1fSXin Li    vadd.s32        q9, q10, q9         @// a1 = c1 + d1(part of r1,r6)
1069*a97c2a1fSXin Li
1070*a97c2a1fSXin Li
1071*a97c2a1fSXin Li    vadd.s32        q0, q11, q14        @// a2 + b2(part of r2)
1072*a97c2a1fSXin Li
1073*a97c2a1fSXin Li
1074*a97c2a1fSXin Li    vsub.s32        q12, q11, q14       @// a2 - b2(part of r5)
1075*a97c2a1fSXin Li
1076*a97c2a1fSXin Li
1077*a97c2a1fSXin Li    vadd.s32        q14, q9, q13        @// a1 + b1(part of r1)
1078*a97c2a1fSXin Li
1079*a97c2a1fSXin Li
1080*a97c2a1fSXin Li    vsub.s32        q13, q9, q13        @// a1 - b1(part of r6)
@// Prediction loads (8 bytes per row) are interleaved with the last narrows.
@// Row numbers count from the row r2 pointed at on entry to this section.
1081*a97c2a1fSXin Li    vld1.8          d18, [r2], r8       @// pred row 0; r2 += r8
1082*a97c2a1fSXin Li
1083*a97c2a1fSXin Li    vqrshrn.s32     d12, q0, #idct_stg2_shift @// r2 = (a2 + b2 + rnd) >> idct_stg2_shift
1084*a97c2a1fSXin Li    vld1.8          d20, [r2], r5       @// pred row 1; r2 += 3 * r8 (now at row 4)
1085*a97c2a1fSXin Li
1086*a97c2a1fSXin Li
1087*a97c2a1fSXin Li    vqrshrn.s32     d15, q12, #idct_stg2_shift @// r5 = (a2 - b2 + rnd) >> idct_stg2_shift
1088*a97c2a1fSXin Li    vld1.8          d19, [r2], r8       @// pred row 4; r2 += r8
1089*a97c2a1fSXin Li
1090*a97c2a1fSXin Li
1091*a97c2a1fSXin Li
1092*a97c2a1fSXin Li
1093*a97c2a1fSXin Li    vqrshrn.s32     d11, q14, #idct_stg2_shift @// r1 = (a1 + b1 + rnd) >> idct_stg2_shift
1094*a97c2a1fSXin Li    vld1.8          d22, [r4], r8       @// pred row 2; r4 += r8 (now at row 3)
1095*a97c2a1fSXin Li
1096*a97c2a1fSXin Li
1097*a97c2a1fSXin Li
1098*a97c2a1fSXin Li
1099*a97c2a1fSXin Li    vqrshrn.s32     d16, q13, #idct_stg2_shift @// r6 = (a1 - b1 + rnd) >> idct_stg2_shift
1100*a97c2a1fSXin Li    vld1.8          d21, [r2], r5       @// pred row 5
1101*a97c2a1fSXin Li
1102*a97c2a1fSXin Li
1103*a97c2a1fSXin Li
1104*a97c2a1fSXin Li
1105*a97c2a1fSXin Lipred_buff_addition:
@// Shared tail of both stage-2 paths: finish the transpose of rows 4-7, add the
@// prediction to the residual, clamp to 8 bits, store eight rows and return.
@// On entry (identical in both feeder paths):
@//   q1,q3,q2,q4 = residual rows 0,1,2,3 (s16 x 8), already transposed
@//   d10..d17    = elements 0..7 of rows 4-7, not yet transposed
@//   d18,d20,d22,d19,d21 = prediction rows 0,1,2,4,5 (8 bytes each)
@//   r4 = pred row 3, r5 = 3 * r8, r3 = dest row 0, r0 = dest row 2, r10 = 3 * r7
1106*a97c2a1fSXin Li
1107*a97c2a1fSXin Li
1108*a97c2a1fSXin Li    vtrn.16         d10, d11
1109*a97c2a1fSXin Li    vld1.8          d24, [r4], r5       @// pred row 3; r4 += 3 * r8 (now at row 6)
1110*a97c2a1fSXin Li
1111*a97c2a1fSXin Li    vtrn.16         d12, d13
1112*a97c2a1fSXin Li    vld1.8          d23, [r4], r8       @// pred row 6; r4 += r8
1113*a97c2a1fSXin Li
@// vaddw.u8 zero-extends the eight prediction bytes and adds them to the s16 lanes.
1114*a97c2a1fSXin Li    vaddw.u8        q1, q1, d18         @// row 0 += pred row 0
1115*a97c2a1fSXin Li    vld1.8          d25, [r4], r5       @// pred row 7
1116*a97c2a1fSXin Li
1117*a97c2a1fSXin Li    vtrn.16         d14, d15
1118*a97c2a1fSXin Li    vaddw.u8        q2, q2, d22         @// row 2 += pred row 2
1119*a97c2a1fSXin Li
1120*a97c2a1fSXin Li    vtrn.16         d16, d17
1121*a97c2a1fSXin Li    vaddw.u8        q3, q3, d20         @// row 1 += pred row 1
1122*a97c2a1fSXin Li
1123*a97c2a1fSXin Li    vtrn.32         d10, d12
1124*a97c2a1fSXin Li    vaddw.u8        q4, q4, d24         @// row 3 += pred row 3
1125*a97c2a1fSXin Li
1126*a97c2a1fSXin Li    vtrn.32         d11, d13
1127*a97c2a1fSXin Li    vtrn.32         d14, d16
1128*a97c2a1fSXin Li    vtrn.32         d15, d17
1129*a97c2a1fSXin Li
1130*a97c2a1fSXin Li    vswp            d11, d14
1131*a97c2a1fSXin Li    vswp            d13, d16
1132*a97c2a1fSXin Li
1133*a97c2a1fSXin Li@ Row values stored in the q register.
1134*a97c2a1fSXin Li
1135*a97c2a1fSXin Li@Q1 :r0
1136*a97c2a1fSXin Li@Q3: r1
1137*a97c2a1fSXin Li@Q2: r2
1138*a97c2a1fSXin Li@Q4: r3
1139*a97c2a1fSXin Li@Q5: r4
1140*a97c2a1fSXin Li@Q7: r5
1141*a97c2a1fSXin Li@Q6: r6
1142*a97c2a1fSXin Li@Q8: r7
1143*a97c2a1fSXin Li
1144*a97c2a1fSXin Li
1145*a97c2a1fSXin Li
1146*a97c2a1fSXin Li@/// Adding the prediction buffer
1147*a97c2a1fSXin Li
1148*a97c2a1fSXin Li
1149*a97c2a1fSXin Li
1150*a97c2a1fSXin Li
1151*a97c2a1fSXin Li
1152*a97c2a1fSXin Li
1153*a97c2a1fSXin Li
1154*a97c2a1fSXin Li
1155*a97c2a1fSXin Li    @ Load prediction data (all eight rows have already been loaded, interleaved with the arithmetic above)
1156*a97c2a1fSXin Li
1157*a97c2a1fSXin Li
1158*a97c2a1fSXin Li
1159*a97c2a1fSXin Li
1160*a97c2a1fSXin Li
1161*a97c2a1fSXin Li    @Adding recon with prediction
1162*a97c2a1fSXin Li
1163*a97c2a1fSXin Li
1164*a97c2a1fSXin Li
1165*a97c2a1fSXin Li
1166*a97c2a1fSXin Li
@// Rows 4-7 get their prediction added while rows 0-3 are narrowed;
@// vqmovun.s16 saturates each signed 16-bit lane to an unsigned byte (0..255).
1167*a97c2a1fSXin Li    vaddw.u8        q5, q5, d19         @// row 4 += pred row 4
1168*a97c2a1fSXin Li    vqmovun.s16     d2, q1              @// row 0 -> u8
1169*a97c2a1fSXin Li    vaddw.u8        q7, q7, d21         @// row 5 += pred row 5
1170*a97c2a1fSXin Li    vqmovun.s16     d4, q2              @// row 2 -> u8
1171*a97c2a1fSXin Li    vaddw.u8        q6, q6, d23         @// row 6 += pred row 6
1172*a97c2a1fSXin Li    vqmovun.s16     d6, q3              @// row 1 -> u8
1173*a97c2a1fSXin Li    vaddw.u8        q8, q8, d25         @// row 7 += pred row 7
1174*a97c2a1fSXin Li    vqmovun.s16     d8, q4              @// row 3 -> u8
1175*a97c2a1fSXin Li
1176*a97c2a1fSXin Li
1177*a97c2a1fSXin Li
1178*a97c2a1fSXin Li
1179*a97c2a1fSXin Li
1180*a97c2a1fSXin Li
1181*a97c2a1fSXin Li
@// r3 walks dest rows 0,1,4,5 and r0 walks dest rows 2,3,6,7; each store writes 8 bytes.
1182*a97c2a1fSXin Li    vst1.8          {d2}, [r3], r7      @// dest row 0; r3 += r7
1183*a97c2a1fSXin Li    vqmovun.s16     d10, q5             @// row 4 -> u8
1184*a97c2a1fSXin Li    vst1.8          {d6}, [r3], r10     @// dest row 1; r3 += 3 * r7 (now at row 4)
1185*a97c2a1fSXin Li    vqmovun.s16     d14, q7             @// row 5 -> u8
1186*a97c2a1fSXin Li    vst1.8          {d4}, [r0], r7      @// dest row 2; r0 += r7
1187*a97c2a1fSXin Li    vqmovun.s16     d12, q6             @// row 6 -> u8
1188*a97c2a1fSXin Li    vst1.8          {d8}, [r0], r10     @// dest row 3; r0 += 3 * r7 (now at row 6)
1189*a97c2a1fSXin Li    vqmovun.s16     d16, q8             @// row 7 -> u8
1190*a97c2a1fSXin Li
1191*a97c2a1fSXin Li
1192*a97c2a1fSXin Li
1193*a97c2a1fSXin Li
1194*a97c2a1fSXin Li
1195*a97c2a1fSXin Li
1196*a97c2a1fSXin Li
1197*a97c2a1fSXin Li    vst1.8          {d10}, [r3], r7     @// dest row 4
1198*a97c2a1fSXin Li    vst1.8          {d14}, [r3], r10    @// dest row 5
1199*a97c2a1fSXin Li    vst1.8          {d12}, [r0], r7     @// dest row 6
1200*a97c2a1fSXin Li    vst1.8          {d16}, [r0], r10    @// dest row 7
1201*a97c2a1fSXin Li
1202*a97c2a1fSXin Li
1203*a97c2a1fSXin Li
1204*a97c2a1fSXin Li
1205*a97c2a1fSXin Li
@// Epilogue. NOTE(review): presumably mirrors the saves done in the prologue,
@// which is outside this view - confirm the two lists match.
1206*a97c2a1fSXin Li    vpop            {d8-d15}            @// restore d8-d15
1207*a97c2a1fSXin Li    ldmfd           sp!, {r4-r12, pc}   @// restore r4-r12 and return by loading pc from the stack
1208*a97c2a1fSXin Li
1209*a97c2a1fSXin Li
1210*a97c2a1fSXin Li
1211*a97c2a1fSXin Li
1212