@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.align 4

@/**
@/*******************************************************************************
@/*
@/* @brief
@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
@/*
@/* @par Description:
@/*  Performs residue calculation by subtracting the prediction from the
@/*  source, followed by a forward transform
@/*
@/* @param[in] pu1_src
@/*  Input 4x4 pixels
@/*
@/* @param[in] pu1_pred
@/*  Prediction data
@/*
@/* @param[in] pi4_tmp
@/*  Temporary buffer of size 4x4
@/*
@/* @param[out] pi2_dst
@/*  Output 4x4 coefficients
@/*
@/* @param[in] src_strd
@/*  Input stride
@/*
@/* @param[in] pred_strd
@/*  Prediction Stride
@/*
@/* @param[in] dst_strd
@/*  Output Stride
@/*
@/* @param[in] chr_plane
@/*  Chroma plane
@/*
@/* @returns  Void
@/*
@/* @remarks
@/*  None
@/*
@/*******************************************************************************
@/*/

@/**************Variables Vs Registers*****************************************
@    r0 => *pu1_src
@    r1 => *pu1_pred
@    r2 => *pi4_temp
@    r3 => *pi2_dst
@    r4 => src_strd
@    r5 => pred_strd
@    r6 => dst_strd
@    r7 => chroma_plane

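@ Illustrative reference for the maths below (assumption: the standard HEVC
@ 4-point forward partial butterfly with coefficients {64, 83, 36}); a rough
@ C sketch of one 1-D stage, not part of the build:
@
@     e0 = c0 + c3;  e1 = c1 + c2;            /* even part */
@     o0 = c0 - c3;  o1 = c1 - c2;            /* odd part  */
@     row0 = 64*e0 + 64*e1;   row1 = 83*o0 + 36*o1;
@     row2 = 64*e0 - 64*e1;   row3 = 36*o0 - 83*o1;
@
@ The code runs this once per direction (after transposing) and folds the
@ rounding shifts into the final VRSHRN.
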
    .global ihevc_resi_trans_4x4_a9q

ihevc_resi_trans_4x4_a9q:

    STMFD          sp!, {r4-r7, r14}   @ save r4-r7 and the return address on the stack
    LDR            r4, [sp,#20]        @ r4 contains src_strd
    LDR            r5, [sp,#24]        @ r5 contains pred_strd
    LDR            r6, [sp,#28]        @ r6 contains dst_strd
    LDR            r7, [sp,#32]        @ r7 contains chroma plane flag

    CMP            r7, #-1
    BEQ            NON_INTERLEAVE_LOAD @ if flag == NULL_PLANE, use non-interleaving loads

    VLD1.64        d0, [r0], r4        @ load row 0 src
    VLD1.64        d4, [r0], r4        @ load row 1 src
    VLD1.64        d1, [r0], r4        @ load row 2 src
    VLD1.64        d5, [r0], r4        @ load row 3 src
    VUZP.8         d0, d4              @ de-interleave pu1_src rows 0-1: plane-0 (U) bytes in d0, plane-1 (V) bytes in d4
    VUZP.8         d1, d5              @ de-interleave pu1_src rows 2-3: plane-0 (U) bytes in d1, plane-1 (V) bytes in d5

    VLD1.64        d2, [r1], r5        @ load row 0 pred
    VLD1.64        d6, [r1], r5        @ load row 1 pred
    VLD1.64        d3, [r1], r5        @ load row 2 pred
    VLD1.64        d7, [r1], r5        @ load row 3 pred
    VUZP.8         d2, d6              @ de-interleave pu1_pred rows 0-1: plane-0 (U) bytes in d2, plane-1 (V) bytes in d6
    VUZP.8         d3, d7              @ de-interleave pu1_pred rows 2-3: plane-0 (U) bytes in d3, plane-1 (V) bytes in d7

    CMP            r7, #0
    BEQ            LOAD_END
    VSWP.8         d0, d4
    VSWP.8         d1, d5
    VSWP.8         d2, d6
    VSWP.8         d3, d7

    B LOAD_END
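    @ Note on the interleaved path above (assumption: when chr_plane >= 0 the
    @ source/prediction point to UV-interleaved chroma): VUZP.8 packs the even
    @ bytes of two rows (chroma plane 0) into d0-d3 and the odd bytes (plane 1)
    @ into d4-d7. For chr_plane == 0 the wanted samples are already in d0-d3;
    @ for chr_plane == 1 the VSWP instructions move them there instead, so the
    @ residue/SAD code below is plane-agnostic.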

NON_INTERLEAVE_LOAD:
    VLD1.U32     d0[0], [r0], r4       @ load row 0 src
    VLD1.U32     d0[1], [r0], r4       @ load row 1 src
    VLD1.U32     d1[0], [r0], r4       @ load row 2 src
    VLD1.U32     d1[1], [r0], r4       @ load row 3 src

    VLD1.U32     d2[0], [r1], r5       @ load row 0 pred
    VLD1.U32     d2[1], [r1], r5       @ load row 1 pred
    VLD1.U32     d3[0], [r1], r5       @ load row 2 pred
    VLD1.U32     d3[1], [r1], r5       @ load row 3 pred

LOAD_END:
    @ Finding the residue
    VSUBL.U8    q2, d0, d2             @ q2 holds the first 8 residues (rows 0-1) as 16-bit values
    VSUBL.U8    q3, d1, d3             @ q3 holds the last 8 residues (rows 2-3) as 16-bit values

    @ SAD calculation
    VABDL.U8    q12, d0, d2            @ q12 contains absolute differences
    VABAL.U8    q12, d1, d3            @ q12 accumulates absolute differences
    VADD.U16    d26, d24, d25          @ add d-registers of q12
    VPADDL.U16  d27, d26               @ d27 contains 2 32-bit values that have to be added
    VPADDL.U32  d28, d27               @ d28 contains 64-bit SAD, only LSB important
    VMOV.32     r0, d28[0]             @ SAD stored in r0 for return
    @ SAD calculation ends
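    @ Scalar equivalent of the SAD above (illustrative sketch, for the
    @ non-interleaved case only):
    @
    @     int i, j, sad = 0;
    @     for(i = 0; i < 4; i++)
    @         for(j = 0; j < 4; j++)
    @             sad += abs(pu1_src[i * src_strd + j] - pu1_pred[i * pred_strd + j]);
    @
    @ The NEON version keeps eight 16-bit partial sums (VABDL/VABAL) and then
    @ reduces them pairwise (VADD/VPADDL) to the single 32-bit value in r0.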

    @ Forward transform - step 1
    VMOV.I16    d2, #64                @ generate immediate constant in d2 for even row multiplication
    VTRN.16     d4, d5                 @ 3-step transpose of residue matrix starts
    VTRN.16     d6, d7                 @ 2nd step of the 3-step matrix transpose
    VMOV.I16    d0, #83                @ generate immediate constant in d0 for odd row multiplication
    VTRN.32     q2, q3                 @ Final step of matrix transpose

    VMOV.I16    d1, #36                @ generate immediate constant in d1 for odd row multiplication
    VSWP        d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
    VADD.S16    q10, q2, q3            @ q10 has the even array
    VSUB.S16    q11, q2, q3            @ q11 has the odd array
    VMULL.S16   q12, d20, d2           @ e[0]*64
    VMLAL.S16   q12, d21, d2[0]        @ row 1 of results: e[0]*64 + e[1]*64
    VMULL.S16   q13, d20, d2           @ e[0]*64
    VMLSL.S16   q13, d21, d2[0]        @ row 3 of results: e[0]*64 - e[1]*64
    VMULL.S16   q8, d22, d0            @ o[0]*83
    VMLAL.S16   q8, d23, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
    VMULL.S16   q9, d22, d1            @ o[0]*36
    VMLSL.S16   q9, d23, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83

    @ Forward transform - step 2
    VMOV.I32    d2, #64                @ generate immediate constant in d2 for even row multiplication
    VMOV.I32    d0, #83                @ generate immediate constant in d0 for odd row multiplication
    VTRN.32     q12, q8                @ 4-step transpose of the step-1 result starts
    VTRN.32     q13, q9                @ 2nd step of the 4-step matrix transpose

    VMOV.I32    d1, #36                @ generate immediate constant in d1 for odd row multiplication
    VSWP        d25, d26               @ 3rd step of the 4-step matrix transpose
    VSWP        d17, d18               @ 4th step of the 4-step matrix transpose
    VADD.S32    q2, q12, q9            @ e[0]
    VADD.S32    q3, q8, q13            @ e[1]
    VSUB.S32    q10, q12, q9           @ o[0]
    VSUB.S32    q11, q8, q13           @ o[1]

    VMUL.S32    q12, q2, d2[0]         @ e[0]*64
    VMLA.S32    q12, q3, d2[0]         @ row 1 of results: e[0]*64 + e[1]*64
    VMUL.S32    q13, q2, d2[0]         @ e[0]*64
    VMLS.S32    q13, q3, d2[0]         @ row 3 of results: e[0]*64 - e[1]*64
    VMUL.S32    q8, q10, d0[0]         @ o[0]*83
    VMLA.S32    q8, q11, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
    VMUL.S32    q9, q10, d1[0]         @ o[0]*36
    VMLS.S32    q9, q11, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83

    VRSHRN.S32  d0, q12, #9            @ (row1 + 256)/512
    VRSHRN.S32  d1, q8, #9             @ (row2 + 256)/512
    VRSHRN.S32  d2, q13, #9            @ (row3 + 256)/512
    VRSHRN.S32  d3, q9, #9             @ (row4 + 256)/512
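    @ Scaling note: in the HEVC reference, the two stage shifts for a 4x4
    @ forward transform with 8-bit input are 1 and 8; no shift is applied after
    @ stage 1 here, so the single VRSHRN #9 above ((x + 256) >> 9) accounts for
    @ both stages at once (rounding once instead of twice).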

    LSL         r7, r6, #1             @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
    VST1.U16    d0, [r3], r7           @ store 1st row of result
    VST1.U16    d1, [r3], r7           @ store 2nd row of result
    VST1.U16    d2, [r3], r7           @ store 3rd row of result
    VST1.U16    d3, [r3], r7           @ store 4th row of result

    LDMFD       sp!,{r4-r7,r15}        @ restore registers and return (pc is loaded from the stack)

    @ Function End

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform type 1
@*  on input pixels
@*
@* @description
@*  Performs residue calculation by subtracting the prediction from the
@*  source, followed by a forward transform
@*
@* @param[in] pu1_src
@*  Input 4x4 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 4x4
@*
@* @param[out] pi2_dst
@*  Output 4x4 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd
@*  Output Stride
@*
@* @param[in] chr_plane (unused)
@*  Chroma plane
@*
@* @returns void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
@                                     UWORD8 *pu1_pred,
@                                     WORD32 *pi4_temp,
@                                     WORD16 *pi2_dst,
@                                     WORD32 src_strd,
@                                     WORD32 pred_strd,
@                                     WORD32 dst_strd,
@                                     WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]   - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************

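@ Illustrative reference for transform type 1 (assumption: the standard HEVC
@ 4-point forward DST matrix with coefficients {29, 55, 74, 84}). Each 1-D
@ stage applied below computes, for inputs c0..c3:
@
@     r0 = 29*c0 + 55*c1 + 74*c2 + 84*c3
@     r1 = 74*(c0 + c1 - c3)
@     r2 = 84*c0 - 29*c1 - 74*c2 + 55*c3
@     r3 = 55*c0 - 84*c1 + 74*c2 - 29*c3
@
@ with a rounding shift of 1 after the first pass and 8 after the second.
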
    .global ihevc_resi_trans_4x4_ttype1_a9q

ihevc_resi_trans_4x4_ttype1_a9q:

    PUSH {r4}
    vpush {d8 - d15}

    LDR r2,[sp,#68]                 @ r2 = src_strd
    LDR r4,[sp,#72]                 @ r4 = pred_strd

    VLD1.32 d2[0],[r0],r2           @ Row 1 of source in d2[0]
    VLD1.32 d3[0],[r1],r4           @ Row 1 of prediction in d3[0]
    VLD1.32 d2[1],[r0],r2           @ Row 2 of source in d2[1]
    VLD1.32 d3[1],[r1],r4           @ Row 2 of prediction in d3[1]

    VLD1.32 d8[0],[r0],r2           @ Row 3 of source in d8[0]
    VABDL.U8 q0,d2,d3               @ Absolute differences of rows 1 and 2 in q0
    VLD1.32 d9[0],[r1],r4           @ Row 3 of prediction in d9[0]
    VSUBL.U8 q5,d2,d3               @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
                                    @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
    VLD1.32 d8[1],[r0]              @ Row 4 of source in d8[1]
    VTRN.16 d10,d11                 @ Transpose step 1
    VLD1.32 d9[1],[r1]              @ Row 4 of prediction in d9[1]

    VSUBL.U8 q6,d8,d9               @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
                                    @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
    VABAL.U8 q0,d8,d9               @ Absolute differences of rows 3 and 4 accumulated in q0
    VTRN.16 d12,d13                 @ Transpose step 2
    VTRN.32 q5,q6                   @ Transpose step 3, Residue block transposed
                                    @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
    VADD.S16 d23,d11,d13            @ d23 = C2 + C4
    VMOV.I32 d6,#55                 @ Constant used for multiplication
    VADD.S16 d22,d10,d13            @ d22 = C1 + C4
    VADD.U16 d0,d1,d0               @ Accumulating SAD step 1
    VMOV.I32 d7,#84                 @ Constant used for multiplication
    VMULL.S16 q7,d23,d6[0]          @ q7  = 55*C2 + 55*C4
    VMOV.I32 d4,#74                 @ Constant used for multiplication
    VMULL.S16 q9,d22,d7[0]          @ q9  = 84*C1 + 84*C4
    VADD.S16 d16,d10,d11            @ d16 = C1 + C2
    VMUL.S16 d12,d12,d4[0]          @ d12 = 74*C3
    VMOV.I32 d5,#29                 @ Constant used for multiplication
    VPADDL.U16 d0,d0                @ Accumulating SAD step 2
    VSUB.S16 d16,d16,d13            @ d16 = C1 + C2 - C4
    VMLAL.S16 q7,d22,d5[0]          @ q7  = 29*C1 + 55*C2 + 84*C4
    VMLSL.S16 q9,d23,d5[0]          @ q9  = 84*C1 - 29*C2 + 55*C4
    VMULL.S16 q8,d16,d4[0]          @ q8  = 74*C1 + 74*C2 - 74*C4
    VPADDL.U32 d0,d0                @ Accumulating SAD step 3, SAD in d0
    VSUB.S32 q10,q9,q7              @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
    VMOV.32 r0,d0[0]                @ Return SAD value
    VRSHR.S32 q8,q8,#1              @ Rounding shift right by 1 bit in q8

    VADDW.S16 q7,q7,d12             @ q7  = 29*C1 + 55*C2 + 74*C3 + 84*C4
    VSUBW.S16 q9,q9,d12             @ q9  = 84*C1 - 29*C2 - 74*C3 + 55*C4
    VADDW.S16 q10,q10,d12           @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4

    VRSHR.S32 q7,q7,#1              @ Rounding shift right by 1 bit in q7
    VRSHR.S32 q9,q9,#1              @ Rounding shift right by 1 bit in q9
    VRSHR.S32 q10,q10,#1            @ Rounding shift right by 1 bit in q10
                                    @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
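    @ The stage-1 rows P1-P4 are transposed below so that q7-q10 hold columns
    @ S1-S4, and the same DST formulas are applied once more for stage 2. The
    @ constants parked earlier in d4-d7 (74, 29, 55, 84) are reused by the
    @ stage-2 multiplies, which is why they were loaded into D registers rather
    @ than rebuilt as immediates.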
    VTRN.32 q7,q8
    VTRN.32 q9,q10
    VSWP d15,d18
    VSWP d17,d20                    @ Transform stage 1 block transposed
                                    @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
    VADD.S32 q13,q7,q8              @ q13 = S1 + S2
    VADD.S32 q1,q7,q10              @ q1 = S1 + S4
    VADD.S32 q4,q8,q10              @ q4 = S2 + S4
    VSUB.S32 q13,q13,q10            @ q13 = S1 + S2 - S4
    VMUL.S32 q12,q1,d5[0]           @ q12 = 29*S1 + 29*S4
    VMUL.S32 q14,q1,d7[0]           @ q14 = 84*S1 + 84*S4
    VMUL.S32 q13,q13,d4[0]          @ q13 = 74*S1 + 74*S2 - 74*S4

    VMLA.S32 q12,q4,d6[0]           @ q12 = 29*S1 + 55*S2 + 84*S4
    VMLS.S32 q14,q4,d5[0]           @ q14 = 84*S1 - 29*S2 + 55*S4
    VMUL.S32 q9,q9,d4[0]            @ q9 = 74*S3

    LDR r4,[sp,#76]                 @ r4 = dst_strd
    LSL r4,r4,#1                    @ r4 = 2*dst_strd

    VRSHRN.S32 d26,q13,#8
    VSUB.S32 q15,q14,q12            @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4

    VADD.S32 q12,q12,q9             @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
    VSUB.S32 q14,q14,q9             @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
    VADD.S32 q15,q15,q9             @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4

    VRSHRN.S32 d24,q12,#8
    VRSHRN.S32 d28,q14,#8
    VRSHRN.S32 d30,q15,#8           @ Truncating the last 8 bits
                                    @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
    VST1.64 d24,[r3],r4             @ Storing row 1 of transform stage 2
    VST1.64 d26,[r3],r4             @ Storing row 2 of transform stage 2
    VST1.64 d28,[r3],r4             @ Storing row 3 of transform stage 2
    VST1.64 d30,[r3]                @ Storing row 4 of transform stage 2

    vpop {d8 - d15}
    POP {r4}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and DCT integer forward transform
@*  on 8x8 block
@*
@* @description
@*  Performs residue calculation by subtracting the prediction from the
@*  source, followed by the DCT integer forward transform
@*
@* @param[in] pu1_src
@*  Input 8x8 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 8x8
@*
@* @param[out] pi2_dst
@*  Output 8x8 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd
@*  Output Stride
@*
@* @param[in] chr_plane
@*  Chroma plane
@*
@* @returns void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@                              UWORD8 *pu1_pred,
@                              WORD32 *pi4_temp,
@                              WORD16 *pi2_dst,
@                              WORD32 src_strd,
@                              WORD32 pred_strd,
@                              WORD32 dst_strd,
@                              WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]   - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************

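@ Illustrative reference for the maths below (assumption: the standard HEVC
@ 8-point forward partial butterfly). With e[k] = c[k] + c[7-k] and
@ o[k] = c[k] - c[7-k] for k = 0..3, each 1-D stage computes:
@
@     F0 = 64*(e0 + e1 + e2 + e3)          F4 = 64*(e0 - e1 - e2 + e3)
@     F2 = 83*(e0 - e3) + 36*(e1 - e2)     F6 = 36*(e0 - e3) - 83*(e1 - e2)
@     F1 = 89*o0 + 75*o1 + 50*o2 + 18*o3   F3 = 75*o0 - 18*o1 - 89*o2 - 50*o3
@     F5 = 50*o0 - 89*o1 + 18*o2 + 75*o3   F7 = 18*o0 - 50*o1 + 75*o2 - 89*o3
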
    .global ihevc_resi_trans_8x8_a9q

ihevc_resi_trans_8x8_a9q:

    PUSH {r4,r5}
    vpush {d8 - d15}

    @ Loading Prediction and Source blocks of size 8x8

    LDR r4,[sp,#84]                 @ r4 = chroma flag

    CMP r4,#-1                      @ NULL PLANE
    BEQ LUMA_LOAD

    CMP r4,#1                       @ V PLANE
    BEQ CHROMA_V_LOAD
                                    @ handling U PLANE
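    @ Note (assumption: UV-interleaved chroma input): VLD2.8 de-interleaves each
    @ 16-byte chroma row as it is loaded, so the U samples land in the first
    @ register of each destination pair and the V samples in the second. This
    @ U-plane path keeps the first register of each pair; CHROMA_V_LOAD below
    @ keeps the second one.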
    LDR r5,[sp,#72]                 @ r5 = src_strd
    LDR r4,[sp,#76]                 @ r4 = pred_strd

    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d0
    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
    VLD2.8 {d2,d4},[r1],r4          @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
    VLD2.8 {d3,d5},[r0],r5          @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
    VLD2.8 {d4,d6},[r1],r4          @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
    VLD2.8 {d5,d7},[r0],r5          @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
    VLD2.8 {d8,d10},[r1],r4         @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
    VLD2.8 {d9,d11},[r0],r5         @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
    VLD2.8 {d10,d12},[r1],r4        @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
    VLD2.8 {d11,d13},[r0],r5        @ Row 6 of source in d11

    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d12
    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d13

    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15

    B LUMA_LOAD_END

CHROMA_V_LOAD:
    LDR r5,[sp,#72]                 @ r5 = src_strd
    LDR r4,[sp,#76]                 @ r4 = pred_strd

    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d2
    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d3

    VABDL.U8 q15,d3,d2              @ Row 1 of absolute difference in q15
    VLD2.8 {d4,d6},[r1],r4          @ Row 2 of prediction in d6
    VSUBL.U8 q0,d3,d2               @ Row 1 of residue in q0
    VLD2.8 {d5,d7},[r0],r5          @ Row 2 of source in d7

    VABDL.U8 q9,d7,d6               @ Row 2 of absolute difference in q9
    VLD2.8 {d8,d10},[r1],r4         @ Row 3 of prediction in d10
    VSUBL.U8 q1,d7,d6               @ Row 2 of residue in q1
    VLD2.8 {d9,d11},[r0],r5         @ Row 3 of source in d11

    VABAL.U8 q15,d11,d10            @ Row 3 of absolute difference accumulated in q15
    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d8
    VSUBL.U8 q2,d11,d10             @ Row 3 of residue in q2
    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d9

    VABAL.U8 q9,d9,d8               @ Row 4 of absolute difference accumulated in q9
    VLD2.8 {d10,d12},[r1],r4        @ Row 5 of prediction in d12
    VSUBL.U8 q3,d9,d8               @ Row 4 of residue in q3
    VLD2.8 {d11,d13},[r0],r5        @ Row 5 of source in d13

    VABDL.U8 q10,d13,d12            @ Row 5 of absolute difference in q10
    VLD2.8 {d14,d16},[r1],r4        @ Row 6 of prediction in d16
    VSUBL.U8 q4,d13,d12             @ Row 5 of residue in q4
    VLD2.8 {d15,d17},[r0],r5        @ Row 6 of source in d17

    VABAL.U8 q15,d17,d16            @ Row 6 of absolute difference accumulated in q15
    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d14
    VSUBL.U8 q5,d17,d16             @ Row 6 of residue in q5
    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d15

    VABAL.U8 q9,d15,d14             @ Row 7 of absolute difference accumulated in q9
    VSUBL.U8 q6,d15,d14             @ Row 7 of residue in q6

    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d16
    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d17
    VSWP.8 d14,d16                  @ Move row 8 prediction into d14 and
    VSWP.8 d15,d17                  @ source into d15, as expected by LUMA_LOAD_END

    B LUMA_LOAD_END

LUMA_LOAD:

    LDR r5,[sp,#72]                 @ r5 = src_strd
    LDR r4,[sp,#76]                 @ r4 = pred_strd

    VLD1.64 d0,[r1],r4              @ Row 1 of prediction in d0
    VLD1.64 d1,[r0],r5              @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
    VLD1.64 d2,[r1],r4              @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
    VLD1.64 d3,[r0],r5              @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
    VLD1.64 d4,[r1],r4              @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
    VLD1.64 d5,[r0],r5              @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
    VLD1.64 d6,[r1],r4              @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
    VLD1.64 d7,[r0],r5              @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
    VLD1.64 d8,[r1],r4              @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
    VLD1.64 d9,[r0],r5              @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
    VLD1.64 d10,[r1],r4             @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
    VLD1.64 d11,[r0],r5             @ Row 6 of source in d11

    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
    VLD1.64 d12,[r1],r4             @ Row 7 of prediction in d12
    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
    VLD1.64 d13,[r0],r5             @ Row 7 of source in d13

    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
    VLD1.64 d14,[r1]                @ Row 8 of prediction in d14
    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
    VLD1.64 d15,[r0]                @ Row 8 of source in d15

LUMA_LOAD_END:

    @ Transform stage 1
    @ Transposing residue matrix

    VABAL.U8 q10,d15,d14            @ Row 8 of absolute difference accumulated in q10
    VTRN.16 q0,q1                   @ Transpose residue matrix step (1a)
    VSUBL.U8 q7,d15,d14             @ Row 8 of residue in q7
    VTRN.16 q2,q3                   @ Transpose residue matrix step (1b)

    VTRN.16 q4,q5                   @ Transpose residue matrix step (1c)
    VTRN.16 q6,q7                   @ Transpose residue matrix step (1d)
    VTRN.32 q0,q2                   @ Transpose residue matrix step (2a)
    VTRN.32 q1,q3                   @ Transpose residue matrix step (2b)

    VADD.U16 q8,q15,q9              @ SAD calculation (1)
    VTRN.32 q4,q6                   @ Transpose residue matrix step (2c)
    VTRN.32 q5,q7                   @ Transpose residue matrix step (2d)

    VADD.U16 q8,q8,q10              @ SAD calculation (2)
    VSWP d1,d8                      @ Transpose residue matrix step (3a)
    VSWP d3,d10                     @ Transpose residue matrix step (3b)

    VADD.U16 d16,d16,d17            @ SAD calculation (3)
    VSWP d7,d14                     @ Transpose residue matrix step (3c)
    VSWP d5,d12                     @ Transpose residue matrix step (3d)
                                    @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
    VPADDL.U16 d16,d16              @ SAD calculation (4)
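    @ SAD note: three 16-bit accumulators are interleaved with the loads above
    @ (q15 for rows 1/3/6, q9 for rows 2/4/7, q10 for rows 5/8); steps (1)-(6)
    @ add them together and reduce the eight 16-bit lanes to the single 32-bit
    @ SAD that is returned in r0.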

    @ Evaluating first step in Butterfly diagram

    VADD.S16 q10,q0,q7              @ q10 = C0 + C7
    VADD.S16 q11,q1,q6              @ q11 = C1 + C6
    VPADDL.U32 d16,d16              @ SAD calculation (5)
    VADD.S16 q12,q2,q5              @ q12 = C2 + C5
    VADD.S16 q13,q3,q4              @ q13 = C3 + C4

    VSUB.S16 q4,q3,q4               @ q4  = C3 - C4
    VSUB.S16 q5,q2,q5               @ q5  = C2 - C5
    VSUB.S16 q6,q1,q6               @ q6  = C1 - C6
    VSUB.S16 q7,q0,q7               @ q7  = C0 - C7

    @ Calculating F0, F2, F4 and F6

    VADD.S16 q1,q11,q12             @ q1  = C1 + C2 + C5 + C6
    VADD.S16 q2,q10,q13             @ q2  = C0 + C3 + C4 + C7

    MOV r4,#50
    LSL r4,r4,#16
    ADD r4,r4,#18
    MOV r5,#89
    LSL r5,r5,#16
    ADD r5,r5,#75
    VMOV d0,r4,r5                   @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18

    MOV r4,#83
    LSL r4,r4,#16
    ADD r4,r4,#36
    VMOV d1,r4,r4                   @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36

    VSUB.S16 q10,q10,q13            @ q10 = C0 - C3 - C4 + C7
    VSUB.S16 q11,q11,q12            @ q11 = C1 - C2 - C5 + C6
    VMOV.32 r0,d16[0]               @ SAD calculation (6) : Return value = SAD

    VSUB.S16 q3,q2,q1               @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
    VADD.S16 q2,q2,q1               @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7

    VMULL.S16 q14,d20,d1[1]         @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
    VMULL.S16 q15,d21,d1[1]         @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
    VMULL.S16 q9,d20,d1[0]          @ q9  = [0] of 36*(C0 - C3 - C4 + C7)
    VMULL.S16 q10,d21,d1[0]         @ q10 = [1] of 36*(C0 - C3 - C4 + C7)

    VMLAL.S16 q14,d22,d1[0]         @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    VSHLL.S16 q13,d6,#6             @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    VMLAL.S16 q15,d23,d1[0]         @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    VSHLL.S16 q3,d7,#6              @ q3  = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    VMLSL.S16 q9,d22,d1[1]          @ q9  = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    VSHLL.S16 q12,d4,#6             @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
    VMLSL.S16 q10,d23,d1[1]         @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    VSHLL.S16 q2,d5,#6              @ q2  = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)

    @ Calculating F1, F3, F5 and F7

    MOV r4,#48
    VST1.64 {d24,d25},[r2]!         @ Row 1 of transform stage 1 F0[0] stored
    VST1.64 {d4,d5},[r2],r4         @ Row 1 of transform stage 1 F0[1] stored
    VST1.64 {d28,d29},[r2]!         @ Row 3 of transform stage 1 F2[0] stored
    VST1.64 {d30,d31},[r2],r4       @ Row 3 of transform stage 1 F2[1] stored

    VST1.64 {d26,d27},[r2]!         @ Row 5 of transform stage 1 F4[0] stored
    VMULL.S16 q1,d14,d0[3]          @ q1  = [0] of 89*(C0 - C7)
    VMULL.S16 q8,d15,d0[3]          @ q8  = [1] of 89*(C0 - C7)
    VST1.64 {d6,d7},[r2],r4         @ Row 5 of transform stage 1 F4[1] stored
    VMULL.S16 q11,d14,d0[2]         @ q11 = [0] of 75*(C0 - C7)
    VMULL.S16 q13,d15,d0[2]         @ q13 = [1] of 75*(C0 - C7)
    VST1.64 {d18,d19},[r2]!         @ Row 7 of transform stage 1 F6[0] stored
    VMULL.S16 q3,d14,d0[1]          @ q3  = [0] of 50*(C0 - C7)
    VMULL.S16 q9,d15,d0[1]          @ q9  = [1] of 50*(C0 - C7)
    VST1.64 {d20,d21},[r2]          @ Row 7 of transform stage 1 F6[1] stored
    VMULL.S16 q10,d14,d0[0]         @ q10 = [0] of 18*(C0 - C7)
    VMULL.S16 q7,d15,d0[0]          @ q7  = [1] of 18*(C0 - C7)

    VMLAL.S16 q1,d12,d0[2]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6)
    VMLAL.S16 q8,d13,d0[2]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6)
    VMLSL.S16 q11,d12,d0[0]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
    VMLSL.S16 q13,d13,d0[0]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
    VMLSL.S16 q3,d12,d0[3]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6)
    VMLSL.S16 q9,d13,d0[3]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6)
    VMLSL.S16 q10,d12,d0[1]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
    VMLSL.S16 q7,d13,d0[1]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6)

    VMLAL.S16 q1,d10,d0[1]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    VMLAL.S16 q8,d11,d0[1]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    VMLSL.S16 q11,d10,d0[3]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    VMLSL.S16 q13,d11,d0[3]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    VMLAL.S16 q3,d10,d0[0]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    VMLAL.S16 q9,d11,d0[0]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    VMLAL.S16 q10,d10,d0[2]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
    VMLAL.S16 q7,d11,d0[2]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)

    VMLAL.S16 q1,d8,d0[0]           @ q1  = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    VMLAL.S16 q8,d9,d0[0]           @ q8  = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    VMLSL.S16 q11,d8,d0[1]          @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    VMLSL.S16 q13,d9,d0[1]          @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    SUB r2,r2,#176                  @ r2 now points to the second row
    VMLAL.S16 q3,d8,d0[2]           @ q3  = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    VMLAL.S16 q9,d9,d0[2]           @ q9  = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    VST1.64 {d2,d3},[r2]!           @ Row 2 of transform stage 1 F1[0] stored
    VMLSL.S16 q10,d8,d0[3]          @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
    VMLSL.S16 q7,d9,d0[3]           @ q7  = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)

    VST1.64 {d16,d17},[r2],r4       @ Row 2 of transform stage 1 F1[1] stored
    VST1.64 {d22,d23},[r2]!         @ Row 4 of transform stage 1 F3[0] stored
    VST1.64 {d26,d27},[r2],r4       @ Row 4 of transform stage 1 F3[1] stored
    VST1.64 {d6,d7},[r2]!           @ Row 6 of transform stage 1 F5[0] stored
    VST1.64 {d18,d19},[r2],r4       @ Row 6 of transform stage 1 F5[1] stored
    VST1.64 {d20,d21},[r2]!         @ Row 8 of transform stage 1 F7[0] stored
    VST1.64 {d14,d15},[r2]          @ Row 8 of transform stage 1 F7[1] stored
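    @ Buffer note: the even rows (F0, F2, F4, F6) are written by the first group
    @ of stores and the odd rows (F1, F3, F5, F7) by the second, but the #48
    @ post-increments and the rewinds by #176/#112 leave pi4_temp as a plain
    @ row-major 8x8 array of 32-bit values (F0..F7, 32 bytes per row), which is
    @ the layout the stage-2 loads below assume.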

    @ Transform stage 2 (for rows 1-4 of transform stage 1)
    @ Transposing the 4 rows (F0, F1, F2, F3)
    @ F0 = {q2,q12},  F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}

    VTRN.32 q12,q1                  @ Transposing first half of transform stage 1 (1a)
    VTRN.32 q14,q11                 @ Transposing first half of transform stage 1 (1b)
    VSWP d25,d28                    @ Transposing first half of transform stage 1 (2a)
    VSWP d22,d3                     @ Transposing first half of transform stage 1 (2b)

    VTRN.32 q2,q8                   @ Transposing first half of transform stage 1 (3a)
    VTRN.32 q15,q13                 @ Transposing first half of transform stage 1 (3b)
    VSWP d5,d30                     @ Transposing first half of transform stage 1 (4a)
    VSWP d26,d17                    @ Transposing first half of transform stage 1 (4b)
                                    @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13

    @ Evaluating first step in Butterfly diagram

    VADD.S32 q0,q12,q13             @ q0  = B0 + B7
    VADD.S32 q5,q11,q2              @ q5  = B3 + B4
    VADD.S32 q3,q1,q15              @ q3  = B1 + B6
    VADD.S32 q4,q14,q8              @ q4  = B2 + B5

    VSUB.S32 q7,q14,q8              @ q7  = B2 - B5
    VSUB.S32 q8,q1,q15              @ q8  = B1 - B6
    VSUB.S32 q6,q11,q2              @ q6  = B3 - B4
    VSUB.S32 q9,q12,q13             @ q9  = B0 - B7

    @ Calculating G0, G2, G4 and G6

    MOV r4,#18
    MOV r5,#50
    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
    VSUB.S32 q2,q0,q5               @ q2  = B0 - B3 - B4 + B7

    MOV r4,#75
    MOV r5,#89
    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
    VADD.S32 q10,q0,q5              @ q10 = B0 + B3 + B4 + B7

    MOV r4,#36
    MOV r5,#83
    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
    VSUB.S32 q11,q3,q4              @ q11 = B1 - B2 - B5 + B6
    VADD.S32 q3,q3,q4               @ q3  = B1 + B2 + B5 + B6

    VMUL.S32 q12,q2,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
    VMUL.S32 q2,q2,d0[0]            @ q2  = 36*(B0 - B3 - B4 + B7)
    VMUL.S32 q5,q9,d3[1]            @ q5 = 89*(B0 - B7)
    VADD.S32 q14,q10,q3             @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
    VMUL.S32 q4,q9,d3[0]            @ q4 = 75*(B0 - B7)
    VSUB.S32 q15,q10,q3             @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
@    VSHL.S32 q14,q14,#6             ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
@    VSHL.S32 q15,q15,#6             ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
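    @ The two VSHL #6 above are skipped on purpose: (64*x + 1024) >> 11 is
    @ exactly (x + 16) >> 5, so the *64 of G0/G4 is folded into the shorter
    @ VRSHRN #5 below, while the remaining rows use VRSHRN #11. Either way the
    @ combined stage-1 + stage-2 down-shift for an 8x8 transform with 8-bit
    @ input (2 + 9 = 11) is applied once here.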

    VMLA.S32 q12,q11,d0[0]          @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
    VRSHRN.I32 d28,q14,#5           @ Truncating last 11 bits in G0 (skipped *64 and >>11 collapse to >>5)
    VMLS.S32 q2,q11,d0[1]           @ q2  = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
    VRSHRN.I32 d30,q15,#5           @ Truncating last 11 bits in G4 (skipped *64 and >>11 collapse to >>5)

    LDR r4,[sp,#80]                 @ r4 = dst_strd
    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2, byte offset between alternate output rows

    VMUL.S32 q3,q9,d2[1]            @ q3 = 50*(B0 - B7)
    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in G2
    VMUL.S32 q9,q9,d2[0]            @ q9 = 18*(B0 - B7)
    VRSHRN.I32 d4,q2,#11            @ Truncating last 11 bits in G6

    VMLA.S32 q5,q8,d3[0]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
    VST1.64 d28,[r3],r4             @ First half-row of row 1 of transform stage 2 (G0) stored
    VMLS.S32 q4,q8,d2[0]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6)

    VMLS.S32 q3,q8,d3[1]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
    VST1.64 d24,[r3],r4             @ First half-row of row 3 of transform stage 2 (G2) stored
    VMLS.S32 q9,q8,d2[1]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6)

    VMLA.S32 q5,q7,d2[1]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
    VST1.64 d30,[r3],r4             @ First half-row of row 5 of transform stage 2 (G4) stored
    VMLS.S32 q4,q7,d3[1]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)

    VMLA.S32 q3,q7,d2[0]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
    VST1.64 d4,[r3]                 @ First half-row of row 7 of transform stage 2 (G6) stored
    VMLA.S32 q9,q7,d3[0]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)

    VMLA.S32 q5,q6,d2[0]            @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    VMLS.S32 q4,q6,d2[1]            @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
    VMLA.S32 q3,q6,d3[0]            @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    VMLS.S32 q9,q6,d3[1]            @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)

    SUB r3,r3,r4,LSL #1
    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
                                    @ r3 is moved from row 7 to row 2
    VRSHRN.I32 d10,q5,#11           @ Truncating last 11 bits in G1
    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in G3
    VRSHRN.I32 d6,q3,#11            @ Truncating last 11 bits in G5
    VST1.64 d10,[r3],r4             @ First half-row of row 2 of transform stage 2 (G1) stored
    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in G7

    VST1.64 d8,[r3],r4              @ First half-row of row 4 of transform stage 2 (G3) stored
    VST1.64 d6,[r3],r4              @ First half-row of row 6 of transform stage 2 (G5) stored
    VST1.64 d18,[r3]!               @ First half-row of row 8 of transform stage 2 (G7) stored

814*c83a76b0SSuyog Pawar    @ Transform stage 2 (for rows 5-8 of transform stage 1)
815*c83a76b0SSuyog Pawar    @ Loading the 4 rows (F4, F5, F6, F7)
816*c83a76b0SSuyog Pawar
817*c83a76b0SSuyog Pawar    SUB r2,r2,#112                  @ r2 jumps from row 8 to row 5 in temporary memory
818*c83a76b0SSuyog Pawar    VLD1.64 {d20,d21},[r2]!         @ q10 = F4[0]
819*c83a76b0SSuyog Pawar    VLD1.64 {d22,d23},[r2]!         @ q11 = F4[1]
820*c83a76b0SSuyog Pawar    VLD1.64 {d8,d9},[r2]!           @ q4  = F5[0]
821*c83a76b0SSuyog Pawar    @ Transposing the 4 rows
822*c83a76b0SSuyog Pawar    @ F0 = {q11,q10}, F1 = {q5,q4}, F2 = {q3,q2} and F3 = {q13,q12}
823*c83a76b0SSuyog Pawar@ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
824*c83a76b0SSuyog Pawar    VTRN.32 q10,q4                  @ Transposing second half of transform stage 1 (1a)
825*c83a76b0SSuyog Pawar    VLD1.64 {d10,d11},[r2]!         @ q5  = F5[1]
826*c83a76b0SSuyog Pawar    VLD1.64 {d4,d5},[r2]!           @ q2  = F6[0]
827*c83a76b0SSuyog Pawar    VLD1.64 {d6,d7},[r2]!           @ q3  = F6[1]
828*c83a76b0SSuyog Pawar    VLD1.64 {d24,d25},[r2]!         @ q12 = F7[0]
829*c83a76b0SSuyog Pawar    VTRN.32 q2,q12                  @ Transposing second half of transform stage 1 (1b)
830*c83a76b0SSuyog Pawar    VLD1.64 {d26,d27},[r2]          @ q13 = F7[1]
831*c83a76b0SSuyog Pawar
832*c83a76b0SSuyog Pawar    VSWP d21,d4                     @ Transposing second half of transform stage 1 (2a)
833*c83a76b0SSuyog Pawar    VSWP d24,d9                     @ Transposing second half of transform stage 1 (2b)
834*c83a76b0SSuyog Pawar
835*c83a76b0SSuyog Pawar    VTRN.32 q11,q5                  @ Transposing second half of transform stage 1 (3a)
836*c83a76b0SSuyog Pawar    VTRN.32 q3,q13                  @ Transposing second half of transform stage 1 (3b)
837*c83a76b0SSuyog Pawar    VSWP d26,d11                    @ Transposing second half of transform stage 1 (4b)
838*c83a76b0SSuyog Pawar    VSWP d23,d6                     @ Transposing second half of transform stage 1 (4a)
839*c83a76b0SSuyog Pawar                                    @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
840*c83a76b0SSuyog Pawar
841*c83a76b0SSuyog Pawar    @ Evaluating first step in Butterfly diagram
842*c83a76b0SSuyog Pawar
843*c83a76b0SSuyog Pawar    VADD.S32 q0,q10,q13             @ q0  = B0 + B7
844*c83a76b0SSuyog Pawar    VADD.S32 q15,q12,q11            @ q15 = B3 + B4
845*c83a76b0SSuyog Pawar    VADD.S32 q1,q4,q3               @ q1  = B1 + B6
846*c83a76b0SSuyog Pawar    VADD.S32 q14,q2,q5              @ q14 = B2 + B5
847*c83a76b0SSuyog Pawar
848*c83a76b0SSuyog Pawar    VSUB.S32 q9,q10,q13             @ q9  = B0 - B7
849*c83a76b0SSuyog Pawar    VSUB.S32 q6,q12,q11             @ q6  = B3 - B4
850*c83a76b0SSuyog Pawar    VSUB.S32 q7,q2,q5               @ q7  = B2 - B5
851*c83a76b0SSuyog Pawar    VSUB.S32 q8,q4,q3               @ q8  = B1 - B6
852*c83a76b0SSuyog Pawar
853*c83a76b0SSuyog Pawar    @ Calculating H0, H2, H4 and H6
854*c83a76b0SSuyog Pawar
855*c83a76b0SSuyog Pawar    VADD.S32 q3,q1,q14              @ q3 = B1 + B2 + B5 + B6
856*c83a76b0SSuyog Pawar    VSUB.S32 q5,q1,q14              @ q5 = B1 - B2 - B5 + B6
857*c83a76b0SSuyog Pawar
858*c83a76b0SSuyog Pawar    MOV r4,#18
859*c83a76b0SSuyog Pawar    MOV r5,#50
860*c83a76b0SSuyog Pawar    VSUB.S32 q4,q0,q15              @ q4 = B0 - B3 - B4 + B7
861*c83a76b0SSuyog Pawar    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
862*c83a76b0SSuyog Pawar
863*c83a76b0SSuyog Pawar    MOV r4,#75
864*c83a76b0SSuyog Pawar    MOV r5,#89
865*c83a76b0SSuyog Pawar    VADD.S32 q2,q0,q15              @ q2 = B0 + B3 + B4 + B7
866*c83a76b0SSuyog Pawar    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
867*c83a76b0SSuyog Pawar
868*c83a76b0SSuyog Pawar    MOV r4,#36
869*c83a76b0SSuyog Pawar    MOV r5,#83
870*c83a76b0SSuyog Pawar
871*c83a76b0SSuyog Pawar    @ Calculating H1, H3, H5 and H7
872*c83a76b0SSuyog Pawar
873*c83a76b0SSuyog Pawar    VMUL.S32 q10,q9,d3[1]           @ q10 = 89*(B0 - B7)
874*c83a76b0SSuyog Pawar    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
875*c83a76b0SSuyog Pawar
876*c83a76b0SSuyog Pawar    VMUL.S32 q13,q9,d3[0]           @ q13 = 75*(B0 - B7)
877*c83a76b0SSuyog Pawar
878*c83a76b0SSuyog Pawar    VMUL.S32 q12,q4,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
879*c83a76b0SSuyog Pawar    VADD.S32 q14,q2,q3              @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
880*c83a76b0SSuyog Pawar    VMUL.S32 q4,q4,d0[0]            @ q4  = 36*(B0 - B3 - B4 + B7)
881*c83a76b0SSuyog Pawar    VSUB.S32 q2,q2,q3               @ q2  = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
882*c83a76b0SSuyog Pawar
883*c83a76b0SSuyog Pawar
884*c83a76b0SSuyog Pawar    VMLA.S32 q12,q5,d0[0]           @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
885*c83a76b0SSuyog Pawar@    VSHL.S32 q14,q14,#6             ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
886*c83a76b0SSuyog Pawar    VMLS.S32 q4,q5,d0[1]            @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
887*c83a76b0SSuyog Pawar@    VSHL.S32 q2,q15,#6              ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
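@ Note on the commented-out shifts: H0 and H4 are nominally 64*(sum), but the
@ multiply by 64 is folded into the narrowing shift below, since
@     (64*x + 1024) >> 11  ==  (x + 16) >> 5
@ which is exactly what VRSHRN #5 computes; the other rows keep VRSHRN #11.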
888*c83a76b0SSuyog Pawar
889*c83a76b0SSuyog Pawar    VMUL.S32 q11,q9,d2[1]           @ q11 = 50*(B0 - B7)
890*c83a76b0SSuyog Pawar    VRSHRN.I32 d28,q14,#5           @ Truncating last 11 bits in H0 (64x multiply folded in, net shift of 5)
891*c83a76b0SSuyog Pawar    VMUL.S32 q9,q9,d2[0]            @ q9  = 18*(B0 - B7)
892*c83a76b0SSuyog Pawar    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in H2
893*c83a76b0SSuyog Pawar
894*c83a76b0SSuyog Pawar    VMLA.S32 q10,q8,d3[0]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
895*c83a76b0SSuyog Pawar    VRSHRN.I32 d4,q2,#5             @ Truncating last 11 bits in H4 (64x multiply folded in, net shift of 5)
896*c83a76b0SSuyog Pawar    VMLS.S32 q13,q8,d2[0]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
897*c83a76b0SSuyog Pawar    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in H6
898*c83a76b0SSuyog Pawar
899*c83a76b0SSuyog Pawar    LDR r4,[sp,#80]                 @ r4 = dst_strd
900*c83a76b0SSuyog Pawar    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
901*c83a76b0SSuyog Pawar
902*c83a76b0SSuyog Pawar    SUB r3,r3,r4,LSL #2
903*c83a76b0SSuyog Pawar    ADD r3,r3,r4,ASR #1             @ r3 = r3 - 7*dst_strd*2
904*c83a76b0SSuyog Pawar                                    @ r3 is moved from row 8 to row 1
905*c83a76b0SSuyog Pawar    VMLS.S32 q11,q8,d3[1]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
906*c83a76b0SSuyog Pawar    VST1.64 d28,[r3],r4             @ Second half-row of row 1 of transform stage 2 (H0) stored
907*c83a76b0SSuyog Pawar    VMLS.S32 q9,q8,d2[1]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6)
908*c83a76b0SSuyog Pawar
909*c83a76b0SSuyog Pawar    VMLA.S32 q10,q7,d2[1]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
910*c83a76b0SSuyog Pawar    VST1.64 d24,[r3],r4             @ Second half-row of row 3 of transform stage 2 (H2) stored
911*c83a76b0SSuyog Pawar    VMLS.S32 q13,q7,d3[1]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
912*c83a76b0SSuyog Pawar
913*c83a76b0SSuyog Pawar    VMLA.S32 q11,q7,d2[0]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
914*c83a76b0SSuyog Pawar    VST1.64 d4,[r3],r4              @ Second half-row of row 5 of transform stage 2 (H4) stored
915*c83a76b0SSuyog Pawar    VMLA.S32 q9,q7,d3[0]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
916*c83a76b0SSuyog Pawar
917*c83a76b0SSuyog Pawar    VMLA.S32 q10,q6,d2[0]           @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
918*c83a76b0SSuyog Pawar    VST1.64 d8,[r3]                 @ Second half-row of row 7 of transform stage 2 (H6) stored
919*c83a76b0SSuyog Pawar    VMLS.S32 q13,q6,d2[1]           @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
920*c83a76b0SSuyog Pawar
921*c83a76b0SSuyog Pawar    VMLA.S32 q11,q6,d3[0]           @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
922*c83a76b0SSuyog Pawar    VMLS.S32 q9,q6,d3[1]            @ q9  = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
923*c83a76b0SSuyog Pawar
924*c83a76b0SSuyog Pawar    SUB r3,r3,r4,LSL #1
925*c83a76b0SSuyog Pawar    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
926*c83a76b0SSuyog Pawar                                    @ r3 is moved from row 7 to row 2
927*c83a76b0SSuyog Pawar    VRSHRN.I32 d20,q10,#11          @ Truncating last 11 bits in H1
928*c83a76b0SSuyog Pawar    VRSHRN.I32 d26,q13,#11          @ Truncating last 11 bits in H3
929*c83a76b0SSuyog Pawar    VRSHRN.I32 d22,q11,#11          @ Truncating last 11 bits in H5
930*c83a76b0SSuyog Pawar    VST1.64 d20,[r3],r4             @ Second half-row of row 2 of transform stage 2 (H1) stored
931*c83a76b0SSuyog Pawar    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in H7
932*c83a76b0SSuyog Pawar
933*c83a76b0SSuyog Pawar    VST1.64 d26,[r3],r4             @ Second half-row of row 4 of transform stage 2 (H3) stored
934*c83a76b0SSuyog Pawar    VST1.64 d22,[r3],r4             @ Second half-row of row 6 of transform stage 2 (H5) stored
935*c83a76b0SSuyog Pawar    VST1.64 d18,[r3]                @ Second half-row of row 8 of transform stage 2 (H7) stored
936*c83a76b0SSuyog Pawar
937*c83a76b0SSuyog Pawar    vpop {d8 - d15}
938*c83a76b0SSuyog Pawar    POP {r4,r5}
939*c83a76b0SSuyog Pawar    MOV pc,lr
940*c83a76b0SSuyog Pawar
941*c83a76b0SSuyog Pawar@/**
942*c83a76b0SSuyog Pawar@*/ *******************************************************************************
943*c83a76b0SSuyog Pawar@*/
944*c83a76b0SSuyog Pawar@*/@brief
945*c83a76b0SSuyog Pawar@*/  This function performs residue calculation and forward  transform on
946*c83a76b0SSuyog Pawar@*/ input pixels
947*c83a76b0SSuyog Pawar@*/
948*c83a76b0SSuyog Pawar@*/@par Description:
949*c83a76b0SSuyog Pawar@*/ Performs residue calculation by subtracting the prediction from the source,
950*c83a76b0SSuyog Pawar@*/ followed by the forward transform
951*c83a76b0SSuyog Pawar@*/
952*c83a76b0SSuyog Pawar@*/ @param[in] pu1_src
953*c83a76b0SSuyog Pawar@*/  Input 16x16 pixels
954*c83a76b0SSuyog Pawar@*/
955*c83a76b0SSuyog Pawar@*/ @param[in] pu1_pred
956*c83a76b0SSuyog Pawar@*/  Prediction data
957*c83a76b0SSuyog Pawar@*/
958*c83a76b0SSuyog Pawar@*/ @param[in] pi4_tmp
959*c83a76b0SSuyog Pawar@*/  Temporary buffer of size 16x16
960*c83a76b0SSuyog Pawar@*/
961*c83a76b0SSuyog Pawar@*/ @param[out] pi2_dst
962*c83a76b0SSuyog Pawar@*/  Output 16x16 coefficients
963*c83a76b0SSuyog Pawar@*/
964*c83a76b0SSuyog Pawar@*/ @param[in] src_strd
965*c83a76b0SSuyog Pawar@*/  Input stride
966*c83a76b0SSuyog Pawar@*/
967*c83a76b0SSuyog Pawar@*/ @param[in] pred_strd
968*c83a76b0SSuyog Pawar@*/  Prediction Stride
969*c83a76b0SSuyog Pawar@*/
970*c83a76b0SSuyog Pawar@*/ @param[in] dst_strd
971*c83a76b0SSuyog Pawar@*/  Output Stride
972*c83a76b0SSuyog Pawar@*/
973*c83a76b0SSuyog Pawar@*/ @param[in] chr_plane
974*c83a76b0SSuyog Pawar@*/  Chroma plane
975*c83a76b0SSuyog Pawar@*/
976*c83a76b0SSuyog Pawar@*/ @returns  Void
977*c83a76b0SSuyog Pawar@*/
978*c83a76b0SSuyog Pawar@*/ @remarks
979*c83a76b0SSuyog Pawar@*/  None
980*c83a76b0SSuyog Pawar@*/
981*c83a76b0SSuyog Pawar@*/*******************************************************************************
982*c83a76b0SSuyog Pawar@*/
983*c83a76b0SSuyog Pawar
984*c83a76b0SSuyog Pawar.extern g_ai2_ihevc_trans_16
985*c83a76b0SSuyog Pawar.extern g_ai4_ihevc_trans_16
986*c83a76b0SSuyog Pawar
987*c83a76b0SSuyog Pawarg_ai2_ihevc_trans_16_addr_1:
988*c83a76b0SSuyog Pawar.long g_ai2_ihevc_trans_16 - ulbl1 - 8
989*c83a76b0SSuyog Pawar
990*c83a76b0SSuyog Pawarg_ai2_ihevc_trans_16_addr_2:
991*c83a76b0SSuyog Pawar.long g_ai2_ihevc_trans_16 - ulbl2 - 8
992*c83a76b0SSuyog Pawar
993*c83a76b0SSuyog Pawarg_ai4_ihevc_trans_16_addr:
994*c83a76b0SSuyog Pawar.long g_ai4_ihevc_trans_16 - ulbl3 - 8
995*c83a76b0SSuyog Pawar
996*c83a76b0SSuyog Pawar    .global ihevc_resi_trans_16x16_a9q
997*c83a76b0SSuyog Pawar
998*c83a76b0SSuyog Pawarihevc_resi_trans_16x16_a9q:
999*c83a76b0SSuyog Pawar
1000*c83a76b0SSuyog Pawar.equ TMP_STRIDE        ,  64            @16*4, Stride of tmp register
1001*c83a76b0SSuyog Pawar.equ SHIFT             ,  13            @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
1002*c83a76b0SSuyog Pawar.equ RADD              ,  4096          @1 << (shift - 1);
1003*c83a76b0SSuyog Pawar
1004*c83a76b0SSuyog Pawar.equ COFF_STD_2B       ,  32            @Stride for g_ai2_ihevc_trans_16 in bytes
1005*c83a76b0SSuyog Pawar.equ COFF_STD_W        ,  32            @Stride for g_ai4_ihevc_trans_16 in bytes
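@ SHIFT/RADD implement the rounding of the vertical (second) pass; in scalar
@ form (illustrative) each final coefficient is produced as
@     pi2_dst[i] = (WORD16)((acc + RADD) >> SHIFT);   /* RADD = 1 << (SHIFT-1) */
@ which matches the VADD with q14 (= RADD) followed by VSHRN #SHIFT further down.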
1006*c83a76b0SSuyog Pawar
1007*c83a76b0SSuyog Pawar@;Save the registers and load the function arguments
1008*c83a76b0SSuyog Pawar    STMFD          SP!,{r4-r12,LR}      @stack store values of the arguments
1009*c83a76b0SSuyog Pawar    vpush          {d8 - d15}
1010*c83a76b0SSuyog Pawar    SUB            SP,SP,#32
1011*c83a76b0SSuyog Pawar
1012*c83a76b0SSuyog Pawar    LDR             R4,[SP,#136]         @get src_strd
1013*c83a76b0SSuyog Pawar    LDR             R5,[SP,#140]         @get pred_strd
1014*c83a76b0SSuyog Pawar    LDR             R6,[SP,#144]         @get dst_strd
1015*c83a76b0SSuyog Pawar    LDR             R14,[SP,#148]        @get chroma_plane
1016*c83a76b0SSuyog Pawar
1017*c83a76b0SSuyog Pawar    MOV R8,#0                           @Set loop counter
1018*c83a76b0SSuyog Pawar    LDR R9,g_ai2_ihevc_trans_16_addr_1    @get 16 bit transform matrix
1019*c83a76b0SSuyog Pawarulbl1:
1020*c83a76b0SSuyog Pawar    ADD R9, R9, PC
1021*c83a76b0SSuyog Pawar    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
1022*c83a76b0SSuyog Pawar    @and write to stack
1023*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_2B
1024*c83a76b0SSuyog Pawar    LSL R12,#2
1025*c83a76b0SSuyog Pawar
1026*c83a76b0SSuyog Pawar    VLD1.S32 D30[0],[R9],R12
1027*c83a76b0SSuyog Pawar    VLD1.S32 D30[1],[R9],R12
1028*c83a76b0SSuyog Pawar    VLD1.S32 D31[0],[R9],R12
1029*c83a76b0SSuyog Pawar    VLD1.S32 D31[1],[R9],R12
1030*c83a76b0SSuyog Pawar
1031*c83a76b0SSuyog Pawar    VTRN.S32 D30,D31
1032*c83a76b0SSuyog Pawar    VTRN.S16 D30,D31
1033*c83a76b0SSuyog Pawar    VST1.S16 {d30,d31},[SP]
1034*c83a76b0SSuyog Pawar
1035*c83a76b0SSuyog Pawar    LDR R9,g_ai2_ihevc_trans_16_addr_2      @get back 16 bit transform matrix
1036*c83a76b0SSuyog Pawarulbl2:
1037*c83a76b0SSuyog Pawar    ADD R9, R9, PC
1038*c83a76b0SSuyog Pawar
1039*c83a76b0SSuyog Pawar    MOV R7,#TMP_STRIDE
1040*c83a76b0SSuyog Pawar
1041*c83a76b0SSuyog Pawar    VMOV.S32 Q14,#0
1042*c83a76b0SSuyog Pawar
1043*c83a76b0SSuyog Pawar@R0         pu1_src
1044*c83a76b0SSuyog Pawar@R1         pu1_pred
1045*c83a76b0SSuyog Pawar@R2         pi4_tmp
1046*c83a76b0SSuyog Pawar@R3         pi2_dst
1047*c83a76b0SSuyog Pawar@R4         src_strd
1048*c83a76b0SSuyog Pawar@R5         pred_strd
1049*c83a76b0SSuyog Pawar@R6         dst_strd
1050*c83a76b0SSuyog Pawar@R7         tmp_dst Nx4 block stride
1051*c83a76b0SSuyog Pawar@R8         loop cntr
1052*c83a76b0SSuyog Pawar@R9         g_ai2_ihevc_trans_16
1053*c83a76b0SSuyog Pawar@R10        tmp_dst Nx4 block offset
1054*c83a76b0SSuyog Pawar@R11        tmp register
1055*c83a76b0SSuyog Pawar@R12        ------
1056*c83a76b0SSuyog Pawar@R14        chroma_plane
1057*c83a76b0SSuyog Pawar@q14        shift 32 bit
1058*c83a76b0SSuyog Pawar@q15        add 32 bit
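@ Taken together with the parameter list documented above, the routine is used
@ from C roughly as below (sketch only; the real prototype lives in the C
@ headers). The value moved into r0 just before the epilogue is the block SAD
@ accumulated in q14 alongside the transform:
@     sad = ihevc_resi_trans_16x16_a9q(pu1_src, pu1_pred, pi4_tmp, pi2_dst,
@                                      src_strd, pred_strd, dst_strd, chr_plane);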
1059*c83a76b0SSuyog Pawar
1060*c83a76b0SSuyog PawarCORE_LOOP_16X16_HORIZ:
1061*c83a76b0SSuyog Pawar
1062*c83a76b0SSuyog Pawar    CMP R14,#-1
1063*c83a76b0SSuyog Pawar    BGT INTERLEAVED_LOAD_S1
1064*c83a76b0SSuyog Pawar
1065*c83a76b0SSuyog Pawar    VLD1.U8 {d0,d1},[R0],R4             @LOAD 1-16 src row 1
1066*c83a76b0SSuyog Pawar    VLD1.U8 {d2,d3},[R1],R5             @LOAD 1-16 pred row 1
1067*c83a76b0SSuyog Pawar    VLD1.U8 {d4,d5},[R0],R4             @LOAD 1-16 src row 2
1068*c83a76b0SSuyog Pawar    VLD1.U8 {d6,d7},[R1],R5             @LOAD 1-16 pred row 2
1069*c83a76b0SSuyog Pawar    B    LOAD_DONE
1070*c83a76b0SSuyog Pawar
1071*c83a76b0SSuyog PawarINTERLEAVED_LOAD_S1:
1072*c83a76b0SSuyog Pawar    CMP R14,#1
1073*c83a76b0SSuyog Pawar    BEQ INTERLEAVED_LOAD_S2
1074*c83a76b0SSuyog Pawar    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1075*c83a76b0SSuyog Pawar    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1076*c83a76b0SSuyog Pawar    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1077*c83a76b0SSuyog Pawar    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1078*c83a76b0SSuyog Pawar    B LOAD_DONE
1079*c83a76b0SSuyog Pawar
1080*c83a76b0SSuyog PawarINTERLEAVED_LOAD_S2:
1081*c83a76b0SSuyog Pawar    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1082*c83a76b0SSuyog Pawar    VSWP.U8 Q0,Q1
1083*c83a76b0SSuyog Pawar    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1084*c83a76b0SSuyog Pawar    VSWP.U8 Q1,Q2
1085*c83a76b0SSuyog Pawar    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1086*c83a76b0SSuyog Pawar    VSWP.U8 Q2,Q3
1087*c83a76b0SSuyog Pawar    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1088*c83a76b0SSuyog Pawar    VSWP.U8 Q3,Q4
1089*c83a76b0SSuyog Pawar
1090*c83a76b0SSuyog PawarLOAD_DONE:
1091*c83a76b0SSuyog Pawar
1092*c83a76b0SSuyog Pawar    VSUBL.U8 Q4,D0,D2                   @Get residue 1-8 row 1
1093*c83a76b0SSuyog Pawar    VSUBL.U8 Q5,D1,D3                   @Get residue 9-16 row 1
1094*c83a76b0SSuyog Pawar    VSUBL.U8 Q6,D4,D6                   @Get residue 1-8 row 2
1095*c83a76b0SSuyog Pawar    VSUBL.U8 Q7,D5,D7                   @Get residue 9-16 row 2
1096*c83a76b0SSuyog Pawar
1097*c83a76b0SSuyog Pawar    @Get blk sads
1098*c83a76b0SSuyog Pawar    VABDL.U8 Q15,D0,D2
1099*c83a76b0SSuyog Pawar    VABAL.U8 Q15,D1,D3
1100*c83a76b0SSuyog Pawar    VABAL.U8 Q15,D4,D6
1101*c83a76b0SSuyog Pawar    VABAL.U8 Q15,D5,D7
1102*c83a76b0SSuyog Pawar    VADDW.S16 Q14,Q14,D30
1103*c83a76b0SSuyog Pawar    VADDW.S16 Q14,Q14,D31
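@ Scalar view of the SAD accumulation above (illustrative): for the two rows
@ just loaded,
@     sad += ABS((WORD32)src[i] - (WORD32)pred[i]);   /* i over all 16 columns */
@ with the 16-bit absolute differences widened into the 32-bit accumulator q14.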
1104*c83a76b0SSuyog Pawar
1105*c83a76b0SSuyog Pawar    VREV64.S16 Q5,Q5                    @Rev row 1
1106*c83a76b0SSuyog Pawar    VREV64.S16 Q7,Q7                    @Rev row 2
1107*c83a76b0SSuyog Pawar    VSWP D10,D11
1108*c83a76b0SSuyog Pawar    VSWP D14,D15
1109*c83a76b0SSuyog Pawar
1110*c83a76b0SSuyog Pawar    VADD.S16 Q8 ,Q4,Q5                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 1
1111*c83a76b0SSuyog Pawar    VSUB.S16 Q9 ,Q4,Q5                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 1
1112*c83a76b0SSuyog Pawar    VADD.S16 Q10,Q6,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 2
1113*c83a76b0SSuyog Pawar    VSUB.S16 Q11,Q6,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 2
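@ Scalar view of this first butterfly stage (per the e[k]/o[k] comments above):
@     e[k] = resi[k] + resi[15 - k];    /* k = 0..7 */
@     o[k] = resi[k] - resi[15 - k];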
1114*c83a76b0SSuyog Pawar
1115*c83a76b0SSuyog Pawar    VREV64.S16    D24,D17               @rev e[k] k-> 4-7 row 1
1116*c83a76b0SSuyog Pawar    VREV64.S16    D25,D21               @rev e[k] k-> 4-7 row 2
1117*c83a76b0SSuyog Pawar    VMOV.S16    D17,D20
1118*c83a76b0SSuyog Pawar
1119*c83a76b0SSuyog Pawar    @arrangement OF DATA
1120*c83a76b0SSuyog Pawar    @Q8     A1 A2 A3 A4 B1 B2 B3 B4
1121*c83a76b0SSuyog Pawar    @Q12    A8 A7 A6 A5 B8 B7 B6 B5
1122*c83a76b0SSuyog Pawar
1123*c83a76b0SSuyog Pawar    VADD.S16 Q13,Q8,Q12                 @ee[k] = e[k] + e[7 - k] row 1 & 2
1124*c83a76b0SSuyog Pawar    VSUB.S16 Q0,Q8,Q12                  @eo[k] = e[k] - e[7 - k] row 1 & 2
1125*c83a76b0SSuyog Pawar
1126*c83a76b0SSuyog Pawar    @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
1127*c83a76b0SSuyog Pawar    @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
1128*c83a76b0SSuyog Pawar    VTRN.S32 D26,D27                    @1-cycle stall before it?
1129*c83a76b0SSuyog Pawar    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1130*c83a76b0SSuyog Pawar    @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
1131*c83a76b0SSuyog Pawar    VREV32.16 D2,D27                    @1-cycle stall before it?
1132*c83a76b0SSuyog Pawar    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1133*c83a76b0SSuyog Pawar    @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
1134*c83a76b0SSuyog Pawar    VMOV.S16 D27,D26
1135*c83a76b0SSuyog Pawar    VNEG.S16 D3,D2
1136*c83a76b0SSuyog Pawar    @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1]  R1ee[0]  R1ee[1]  R2ee[0]  R2ee[1]
1137*c83a76b0SSuyog Pawar    @Q1  R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
1138*c83a76b0SSuyog Pawar
1139*c83a76b0SSuyog Pawar    @D8 : [0 0] [4 0] [8 0] [12 0]
1140*c83a76b0SSuyog Pawar    @D9 : [0 1] [4 1] [8 1] [12 1]
1141*c83a76b0SSuyog Pawar    VLD1.S16 {d8,d9},[SP]               @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
1142*c83a76b0SSuyog Pawar    VADD.S16 Q1,Q13,Q1                  @ 1-cycle stall before it?
1143*c83a76b0SSuyog Pawar    @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1144*c83a76b0SSuyog Pawar
1145*c83a76b0SSuyog Pawar    @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1]
1146*c83a76b0SSuyog Pawar    @    R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1147*c83a76b0SSuyog Pawar    VTRN.S16 D2,D3                      @2-cycle stall before it?
1148*c83a76b0SSuyog Pawar    @Q1  R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
1149*c83a76b0SSuyog Pawar    @     R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
1150*c83a76b0SSuyog Pawar
1151*c83a76b0SSuyog Pawar    VDUP.S32 D4,D2[0]    @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]    ;1-cycle stall?
1152*c83a76b0SSuyog Pawar    VDUP.S32 D5,D2[1]    @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1153*c83a76b0SSuyog Pawar    VDUP.S32 D6,D3[0]    @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1154*c83a76b0SSuyog Pawar    VDUP.S32 D7,D3[1]    @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
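@ The VNEG/VADD/VTRN sequence above evaluates, per row (scalar sketch):
@     eee[k] = ee[k] + ee[3 - k];    eeo[k] = ee[k] - ee[3 - k];    /* k = 0,1 */
@ the negated upper half lets a single VADD produce the sums and differences at
@ once, and each VDUP then broadcasts one {eee[k], eeo[k]} pair of one row for
@ the multiplies below.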
1155*c83a76b0SSuyog Pawar
1156*c83a76b0SSuyog Pawar    @---------------Process EO--------------------
1157*c83a76b0SSuyog Pawar    @ Early start to avoid stalls
1158*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_2B                @Get stride of coeffs
1159*c83a76b0SSuyog Pawar
1160*c83a76b0SSuyog Pawar    VMULL.S16 Q5,D4,D8                  @   g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
1161*c83a76b0SSuyog Pawar    VMLAL.S16 Q5,D6,D9                  @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1162*c83a76b0SSuyog Pawar    VMULL.S16 Q6,D5,D8                  @   g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1163*c83a76b0SSuyog Pawar    VMLAL.S16 Q6,D7,D9                  @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
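@ With d8/d9 holding g_ai2_ihevc_trans_16[{0,4,8,12}][0..1] (saved to the stack
@ earlier), the MULL/MLAL pairs above compute, per row, the partial outputs for
@ coefficient rows 0, 4, 8 and 12 (scalar sketch, T = g_ai2_ihevc_trans_16):
@     out0  = T[0][0]*eee[0]  + T[0][1]*eee[1];
@     out4  = T[4][0]*eeo[0]  + T[4][1]*eeo[1];
@     out8  = T[8][0]*eee[0]  + T[8][1]*eee[1];
@     out12 = T[12][0]*eeo[0] + T[12][1]*eeo[1];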
1164*c83a76b0SSuyog Pawar
1165*c83a76b0SSuyog Pawar    ADD R11,R9,R12,LSL #1               @Load address of g_ai2_ihevc_trans_16[2]
1166*c83a76b0SSuyog Pawar    LSL R12,R12,#2
1167*c83a76b0SSuyog Pawar
1168*c83a76b0SSuyog Pawar    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[2][0-4]]
1169*c83a76b0SSuyog Pawar
1170*c83a76b0SSuyog Pawar    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[6][0-4]
1171*c83a76b0SSuyog Pawar    VMULL.S16 Q1,D26,D0                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R1
1172*c83a76b0SSuyog Pawar
1173*c83a76b0SSuyog Pawar    VMULL.S16 Q2,D26,D1                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R2
1174*c83a76b0SSuyog Pawar
1175*c83a76b0SSuyog Pawar    VZIP.S32 Q5,Q6                      @3-cycle instruction
1176*c83a76b0SSuyog Pawar    VMULL.S16 Q3,D27,D0                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R1
1177*c83a76b0SSuyog Pawar
1178*c83a76b0SSuyog Pawar
1179*c83a76b0SSuyog Pawar    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[10][0-4]
1180*c83a76b0SSuyog Pawar    VMULL.S16 Q4,D27,D1                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R2
1181*c83a76b0SSuyog Pawar
1182*c83a76b0SSuyog Pawar    @These values must go to 0 4 8 12 columns hence we need stride *4
1183*c83a76b0SSuyog Pawar    LSL R10,R7,#2
1184*c83a76b0SSuyog Pawar
1185*c83a76b0SSuyog Pawar    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[14][0-4]
1186*c83a76b0SSuyog Pawar
1187*c83a76b0SSuyog Pawar    VST1.32 D10,[R2],R10
1188*c83a76b0SSuyog Pawar    VMULL.S16 Q8,D27,D1                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
1189*c83a76b0SSuyog Pawar
1190*c83a76b0SSuyog Pawar    VST1.32 D11,[R2],R10
1191*c83a76b0SSuyog Pawar    VMULL.S16 Q7,D27,D0                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
1192*c83a76b0SSuyog Pawar
1193*c83a76b0SSuyog Pawar    VST1.32 D12,[R2],R10
1194*c83a76b0SSuyog Pawar    VMULL.S16 Q5,D26,D0                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
1195*c83a76b0SSuyog Pawar
1196*c83a76b0SSuyog Pawar    VST1.32 D13,[R2],R10
1197*c83a76b0SSuyog Pawar    VMULL.S16 Q6,D26,D1                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
1198*c83a76b0SSuyog Pawar
1199*c83a76b0SSuyog Pawar    SUB R2,R2,R10,LSL #2
1200*c83a76b0SSuyog Pawar
1201*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix row1
1202*c83a76b0SSuyog Pawar    VTRN.32 Q1, Q3                      @R1 transpose1 -- 2 cycles
1203*c83a76b0SSuyog Pawar
1204*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix row2
1205*c83a76b0SSuyog Pawar    VTRN.32 Q2,Q4                       @R2 transpose1 -- 2 cycles
1206*c83a76b0SSuyog Pawar
1207*c83a76b0SSuyog Pawar    VTRN.32 Q5, Q7                      @R1 transpose1 -- 2 cycles
1208*c83a76b0SSuyog Pawar
1209*c83a76b0SSuyog Pawar    VTRN.32 Q6,Q8                       @R2 transpose1 -- 2 cycles
1210*c83a76b0SSuyog Pawar
1211*c83a76b0SSuyog Pawar    VSWP    D10,D3                      @R1 transpose2
1212*c83a76b0SSuyog Pawar    VSWP    D14,D7                      @R1 transpose2
1213*c83a76b0SSuyog Pawar
1214*c83a76b0SSuyog Pawar    VSWP    D12,D5                      @R2 transpose2
1215*c83a76b0SSuyog Pawar    VSWP    D16,D9                      @R2 transpose2
1216*c83a76b0SSuyog Pawar
1217*c83a76b0SSuyog Pawar    VADD.S32 Q5,Q5,Q1                   @R1 add
1218*c83a76b0SSuyog Pawar    VADD.S32 Q3,Q3,Q7                   @R1 add
1219*c83a76b0SSuyog Pawar
1220*c83a76b0SSuyog Pawar    VADD.S32 Q2,Q2,Q4                   @R2 add
1221*c83a76b0SSuyog Pawar    VADD.S32 Q6,Q6,Q8                   @R2 add
1222*c83a76b0SSuyog Pawar
1223*c83a76b0SSuyog Pawar    VADD.S32 Q5,Q5,Q3                   @R1 add
1224*c83a76b0SSuyog Pawar
1225*c83a76b0SSuyog Pawar    VADD.S32 Q4,Q6,Q2                   @R2 add
1226*c83a76b0SSuyog Pawar
1227*c83a76b0SSuyog Pawar    @-----------------------Processing O ----------------------------
1228*c83a76b0SSuyog Pawar    @ Early start to avoid stalls
1229*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_2B                @Get coeffs stride
1230*c83a76b0SSuyog Pawar    LSL R12,R12,#1
1231*c83a76b0SSuyog Pawar    ADD R11,R9,#COFF_STD_2B             @Get address of g_ai2_ihevc_trans_16[1]
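@ Each odd destination row below is a full 8-tap dot product of one row of the
@ coefficient table with the o[] vector; scalar sketch (T = g_ai2_ihevc_trans_16,
@ r in {1, 3, 5, ..., 15}):
@     out_r = T[r][0]*o[0] + T[r][1]*o[1] + ... + T[r][7]*o[7];
@ implemented as VMULL over o[0..3] plus VMLAL over o[4..7], then reduced by the
@ transposes and adds that follow.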
1232*c83a76b0SSuyog Pawar
1233*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
1234*c83a76b0SSuyog Pawar
1235*c83a76b0SSuyog Pawar    VZIP.S32 Q5,Q4                      @ 3 cycle instruction
1236*c83a76b0SSuyog Pawar    VMULL.S16 Q6,D18,D4                 @o[0][0-3]*  R1
1237*c83a76b0SSuyog Pawar
1238*c83a76b0SSuyog Pawar
1239*c83a76b0SSuyog Pawar    VMLAL.S16 Q6,D19,D5                 @o[0][4-7]*  R1     ; follows MULL instruction: Multiplier accumulator forwarding
1240*c83a76b0SSuyog Pawar    @write to memory
1241*c83a76b0SSuyog Pawar    @this should go to 2 6 10 14
1242*c83a76b0SSuyog Pawar    LSL R10,R7,#2
1243*c83a76b0SSuyog Pawar    ADD R2,R2,R7,LSL #1                 @move to third row
1244*c83a76b0SSuyog Pawar    VST1.32 D10,[R2],R10
1245*c83a76b0SSuyog Pawar    VMULL.S16 Q7,D22,D4                 @o[0][0-3]*  R2
1246*c83a76b0SSuyog Pawar
1247*c83a76b0SSuyog Pawar    VST1.32 D11,[R2],R10
1248*c83a76b0SSuyog Pawar    VMLAL.S16 Q7,D23,D5                 @o[0][4-7]*  R2
1249*c83a76b0SSuyog Pawar
1250*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1251*c83a76b0SSuyog Pawar
1252*c83a76b0SSuyog Pawar    VST1.32 D8,[R2],R10
1253*c83a76b0SSuyog Pawar    VMULL.S16 Q8,D18,D4                 @o[1][0-3]*  R1
1254*c83a76b0SSuyog Pawar
1255*c83a76b0SSuyog Pawar    VST1.32 D9,[R2],R10
1256*c83a76b0SSuyog Pawar    VMLAL.S16 Q8,D19,D5                 @o[1][4-7]*  R1
1257*c83a76b0SSuyog Pawar    SUB R2,R2,R10,LSL #2
1258*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #1
1259*c83a76b0SSuyog Pawar
1260*c83a76b0SSuyog Pawar    @--------------------Done processing EO -------------------------
1261*c83a76b0SSuyog Pawar
1262*c83a76b0SSuyog Pawar    @ -----------------Processing O continues------------------------
1263*c83a76b0SSuyog Pawar
1264*c83a76b0SSuyog Pawar    VMULL.S16 Q10,D22,D4                @o[1][0-3]*  R2
1265*c83a76b0SSuyog Pawar    VMLAL.S16 Q10,D23,D5                @o[1][4-7]*  R2
1266*c83a76b0SSuyog Pawar
1267*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1268*c83a76b0SSuyog Pawar
1269*c83a76b0SSuyog Pawar    VLD1.S16 {d6,d7},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1270*c83a76b0SSuyog Pawar    VMULL.S16 Q12,D18,D4                @o[2][0-3]*  R1
1271*c83a76b0SSuyog Pawar
1272*c83a76b0SSuyog Pawar    VMLAL.S16 Q12,D19,D5                @o[2][4-7]*  R1
1273*c83a76b0SSuyog Pawar    VMULL.S16 Q0,D18,D6                 @o[3][0-3]*  R1
1274*c83a76b0SSuyog Pawar    VMLAL.S16 Q0,D19,D7                 @o[3][4-7]*  R1
1275*c83a76b0SSuyog Pawar
1276*c83a76b0SSuyog Pawar    VMULL.S16 Q13,D22,D4                @o[2][0-3]*  R2
1277*c83a76b0SSuyog Pawar    VMLAL.S16 Q13,D23,D5                @o[2][4-7]*  R2
1278*c83a76b0SSuyog Pawar    VMULL.S16 Q1,D22,D6                 @o[3][0-3]*  R2
1279*c83a76b0SSuyog Pawar    VMLAL.S16 Q1,D23,D7                 @o[3][4-7]*  R2
1280*c83a76b0SSuyog Pawar
1281*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R1
1282*c83a76b0SSuyog Pawar    VTRN.32 Q6, Q8                      @ 2-cycle instruction
1283*c83a76b0SSuyog Pawar
1284*c83a76b0SSuyog Pawar    VTRN.32 Q12,Q0                      @ 2-cycle instruction
1285*c83a76b0SSuyog Pawar
1286*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R2
1287*c83a76b0SSuyog Pawar    VTRN.32 Q7,Q10                      @ 2-cycle instruction
1288*c83a76b0SSuyog Pawar
1289*c83a76b0SSuyog Pawar    VTRN.32 Q13,Q1                      @ 2-cycle instruction
1290*c83a76b0SSuyog Pawar
1291*c83a76b0SSuyog Pawar    VSWP    D24,D13
1292*c83a76b0SSuyog Pawar    VSWP    D0, D17
1293*c83a76b0SSuyog Pawar
1294*c83a76b0SSuyog Pawar    VSWP     D26,D15
1295*c83a76b0SSuyog Pawar    VSWP    D2,D21
1296*c83a76b0SSuyog Pawar
1297*c83a76b0SSuyog Pawar    VADD.S32 Q8 ,Q8 ,Q6
1298*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q0
1299*c83a76b0SSuyog Pawar
1300*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q10,Q7
1301*c83a76b0SSuyog Pawar    VADD.S32 Q13,Q13,Q1
1302*c83a76b0SSuyog Pawar
1303*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1304*c83a76b0SSuyog Pawar    VADD.S32 Q12 ,Q12 ,Q8
1305*c83a76b0SSuyog Pawar
1306*c83a76b0SSuyog Pawar    VADD.S32 Q13,Q13,Q10
1307*c83a76b0SSuyog Pawar    VMULL.S16 Q3,D18,D4                 @o[4][0-3]*  R1
1308*c83a76b0SSuyog Pawar    VMLAL.S16 Q3,D19,D5                 @o[4][4-7]*  R1
1309*c83a76b0SSuyog Pawar
1310*c83a76b0SSuyog Pawar    VZIP.S32 Q12,Q13
1311*c83a76b0SSuyog Pawar    VMULL.S16 Q4,D22,D4                 @o[4][0-3]*  R2
1312*c83a76b0SSuyog Pawar
1313*c83a76b0SSuyog Pawar
1314*c83a76b0SSuyog Pawar    VMLAL.S16 Q4,D23,D5                 @o[4][4-7]*  R2
1315*c83a76b0SSuyog Pawar    @write to memory
1316*c83a76b0SSuyog Pawar    @this should go to 1 3 5 7
1317*c83a76b0SSuyog Pawar    ADD R2,R2,R7
1318*c83a76b0SSuyog Pawar    LSL R7,R7,#1
1319*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1320*c83a76b0SSuyog Pawar
1321*c83a76b0SSuyog Pawar    VST1.32 D24,[R2],R7
1322*c83a76b0SSuyog Pawar    VMULL.S16 Q5,D18,D4                 @o[5][0-3]*  R1
1323*c83a76b0SSuyog Pawar
1324*c83a76b0SSuyog Pawar    VST1.32 D25,[R2],R7
1325*c83a76b0SSuyog Pawar    VMLAL.S16 Q5,D19,D5                 @o[5][4-7]*  R1
1326*c83a76b0SSuyog Pawar
1327*c83a76b0SSuyog Pawar    VST1.32 D26,[R2],R7
1328*c83a76b0SSuyog Pawar    VMULL.S16 Q6,D22,D4                 @o[5][0-3]*  R2
1329*c83a76b0SSuyog Pawar
1330*c83a76b0SSuyog Pawar    VST1.32 D27,[R2],R7
1331*c83a76b0SSuyog Pawar    VMLAL.S16 Q6,D23,D5                 @o[5][4-7]*  R2
1332*c83a76b0SSuyog Pawar
1333*c83a76b0SSuyog Pawar    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1334*c83a76b0SSuyog Pawar
1335*c83a76b0SSuyog Pawar    VLD1.S16 {d2,d3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1336*c83a76b0SSuyog Pawar    VMULL.S16 Q7,D18,D4                 @o[6][0-3]*  R1
1337*c83a76b0SSuyog Pawar
1338*c83a76b0SSuyog Pawar    VMLAL.S16 Q7,D19,D5                 @o[6][4-7]*  R1
1339*c83a76b0SSuyog Pawar    VMULL.S16 Q10,D18,D2                @o[7][0-3]*  R1
1340*c83a76b0SSuyog Pawar    VMLAL.S16 Q10,D19,D3                @o[7][4-7]*  R1
1341*c83a76b0SSuyog Pawar
1342*c83a76b0SSuyog Pawar    VMULL.S16 Q8,D22,D4                 @o[6][0-3]*  R2
1343*c83a76b0SSuyog Pawar    VMLAL.S16 Q8,D23,D5                 @o[6][4-7]*  R2
1344*c83a76b0SSuyog Pawar    VMULL.S16 Q12,D22,D2                @o[7][0-3]*  R2
1345*c83a76b0SSuyog Pawar    VMLAL.S16 Q12,D23,D3                @o[7][4-7]*  R2
1346*c83a76b0SSuyog Pawar
1347*c83a76b0SSuyog Pawar
1348*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R1
1349*c83a76b0SSuyog Pawar    VTRN.32 Q3 ,Q5                      @ 2-cycle instruction
1350*c83a76b0SSuyog Pawar
1351*c83a76b0SSuyog Pawar    VTRN.32 Q7 ,Q10                     @ transpose step 2 R1 , 2-cycle instruction
1352*c83a76b0SSuyog Pawar
1353*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R2
1354*c83a76b0SSuyog Pawar    VTRN.32 Q4 ,Q6                      @ 2-cycle instruction
1355*c83a76b0SSuyog Pawar
1356*c83a76b0SSuyog Pawar    VTRN.32 Q8 ,Q12                     @ transpose step 2 R2 , 2-cycle instruction
1357*c83a76b0SSuyog Pawar
1358*c83a76b0SSuyog Pawar    VSWP    D14,D7                      @ transpose step 3, R1
1359*c83a76b0SSuyog Pawar    VSWP    D20,D11                     @ transpose step 4, R1
1360*c83a76b0SSuyog Pawar    VSWP    D16,D9                      @ transpose step 3, R2
1361*c83a76b0SSuyog Pawar    VSWP    D24,D13                     @ transpose step 4, R2
1362*c83a76b0SSuyog Pawar
1363*c83a76b0SSuyog Pawar    VADD.S32 Q5 ,Q5 ,Q3
1364*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q10,Q7
1365*c83a76b0SSuyog Pawar    VADD.S32 Q6 ,Q6 ,Q4
1366*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q8
1367*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q10,Q5
1368*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q6
1369*c83a76b0SSuyog Pawar
1370*c83a76b0SSuyog Pawar    @ 2-cycle stall
1371*c83a76b0SSuyog Pawar    VZIP.S32 Q10,Q12                    @ 3-cycle instruction
1372*c83a76b0SSuyog Pawar
1373*c83a76b0SSuyog Pawar    @ 2-cycle stall
1374*c83a76b0SSuyog Pawar    @this should go to 9 11 13 15
1375*c83a76b0SSuyog Pawar    VST1.32 D20,[R2],R7
1376*c83a76b0SSuyog Pawar
1377*c83a76b0SSuyog Pawar    VST1.32 D21,[R2],R7
1378*c83a76b0SSuyog Pawar
1379*c83a76b0SSuyog Pawar    VST1.32 D24,[R2],R7
1380*c83a76b0SSuyog Pawar
1381*c83a76b0SSuyog Pawar    VST1.32 D25,[R2],R7
1382*c83a76b0SSuyog Pawar
1383*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #3
1384*c83a76b0SSuyog Pawar    LSR R7,R7,#1
1385*c83a76b0SSuyog Pawar    SUB R2,R2,R7
1386*c83a76b0SSuyog Pawar
1387*c83a76b0SSuyog Pawar    ADD R2,R2,#8                        @Move to the next column of pi4_tmp
1388*c83a76b0SSuyog Pawar
1389*c83a76b0SSuyog Pawar    ADD R8,R8,#2                        @increment loop cntr
1390*c83a76b0SSuyog Pawar    CMP R8,#16                          @check loop cntr
1391*c83a76b0SSuyog Pawar    BNE CORE_LOOP_16X16_HORIZ           @jump acc
1392*c83a76b0SSuyog Pawar
1393*c83a76b0SSuyog Pawar
1394*c83a76b0SSuyog Pawar@*****************Vertical transform************************************
1395*c83a76b0SSuyog Pawar
1396*c83a76b0SSuyog Pawar@Initialization for vert transform
1397*c83a76b0SSuyog Pawar@pi4_tmp will be the new src
1398*c83a76b0SSuyog Pawar@tmp stride will be new src stride
1399*c83a76b0SSuyog Pawar@dst will be new pi4_tmp
1400*c83a76b0SSuyog Pawar@dst stride will be new tmp stride
1401*c83a76b0SSuyog Pawar@trans table will be of 32 bit
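@ In effect the second pass re-runs the transform with the roles swapped;
@ scalar sketch of the pointer setup performed just below (illustrative):
@     src          = pi4_tmp;              /* r0 = r2 - 64, back to the origin  */
@     src row step = TMP_STRIDE bytes;     /* r4                                */
@     dst          = pi2_dst;              /* r2 = r3                           */
@     dst row step = dst_strd * 2 bytes;   /* r7 = r6 << 1, 16-bit output words */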
1402*c83a76b0SSuyog Pawar
1403*c83a76b0SSuyog Pawar    LDR R9,g_ai4_ihevc_trans_16_addr        @get 32 bit transform matrix
1404*c83a76b0SSuyog Pawarulbl3:
1405*c83a76b0SSuyog Pawar    ADD R9, R9, PC
1406*c83a76b0SSuyog Pawar
1407*c83a76b0SSuyog Pawar    SUB R0,R2,#64                       @set tmp as src [move back 64 bytes to the origin]
1408*c83a76b0SSuyog Pawar    MOV R2,R3                           @set dst as tmp
1409*c83a76b0SSuyog Pawar    MOV R4,#TMP_STRIDE                  @set tmp stride as src stride
1410*c83a76b0SSuyog Pawar    LSL R7,R6,#1                        @Set dst stride as tmp stride
1411*c83a76b0SSuyog Pawar    SUB R4,#48                          @Adjust stride for the 3 preceding post-incremented loads (3*16 bytes)
1412*c83a76b0SSuyog Pawar
1413*c83a76b0SSuyog Pawar    @Block SAD
1414*c83a76b0SSuyog Pawar    VADD.S32 D28,D28,D29
1415*c83a76b0SSuyog Pawar    VPADD.S32 D28,D28,D29
1416*c83a76b0SSuyog Pawar    VMOV.S32 R3,D28[0]
1417*c83a76b0SSuyog Pawar    @ SAD calculation ends -- final value in R3.
1418*c83a76b0SSuyog Pawar
1419*c83a76b0SSuyog Pawar    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
1420*c83a76b0SSuyog Pawar    @values of g_ai4_ihevc_trans_16 and write to stack
1421*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_W
1422*c83a76b0SSuyog Pawar    LSL R12,R12,#2
1423*c83a76b0SSuyog Pawar    VLD1.S32 D28,[R9],R12
1424*c83a76b0SSuyog Pawar    VLD1.S32 D29,[R9],R12
1425*c83a76b0SSuyog Pawar    VLD1.S32 D30,[R9],R12
1426*c83a76b0SSuyog Pawar    VLD1.S32 D31,[R9],R12
1427*c83a76b0SSuyog Pawar    SUB R9,R9,R12,LSL #2
1428*c83a76b0SSuyog Pawar
1429*c83a76b0SSuyog Pawar    VREV64.32 Q15,Q15
1430*c83a76b0SSuyog Pawar    VTRN.S32 Q14,Q15
1431*c83a76b0SSuyog Pawar    VST1.S32 {Q14-Q15},[SP]
1432*c83a76b0SSuyog Pawar
1433*c83a76b0SSuyog Pawar    VMOV.U32 Q14,#RADD                  @get the round factor to q14
1434*c83a76b0SSuyog Pawar    VMOV.U32 Q15,#SHIFT                 @Get the shift to neon
1435*c83a76b0SSuyog Pawar
1436*c83a76b0SSuyog Pawar    MOV R8,#0                           @INIT LOOP
1437*c83a76b0SSuyog Pawar
1438*c83a76b0SSuyog PawarCORE_LOOP_16X16_VERT:
1439*c83a76b0SSuyog Pawar
1440*c83a76b0SSuyog Pawar    VLD1.S32 {D0,D1},[R0]!              @LOAD 1-4 src R1
1441*c83a76b0SSuyog Pawar    VLD1.S32 {D2,D3},[R0]!              @LOAD 5-8 src R1
1442*c83a76b0SSuyog Pawar    VLD1.S32 {D4,D5},[R0]!              @LOAD 9-12 src R1
1443*c83a76b0SSuyog Pawar    VLD1.S32 {D6,D7},[R0],R4            @LOAD 13-16 src R1
1444*c83a76b0SSuyog Pawar
1445*c83a76b0SSuyog Pawar    VLD1.S32 {D8,D9},[R0]!              @LOAD 1-4 src R2
1446*c83a76b0SSuyog Pawar    VLD1.S32 {D10,D11},[R0]!            @LOAD 5-8 src R2
1447*c83a76b0SSuyog Pawar    VLD1.S32 {D12,D13},[R0]!            @LOAD 9-12 src R2
1448*c83a76b0SSuyog Pawar    VLD1.S32 {D14,D15},[R0],R4          @LOAD 13-16 src R2
1449*c83a76b0SSuyog Pawar
1450*c83a76b0SSuyog Pawar    VREV64.S32 Q2,Q2                    @Rev 9-12 R1
1451*c83a76b0SSuyog Pawar    VREV64.S32 Q3,Q3                    @Rev 12-16 R1
1452*c83a76b0SSuyog Pawar    VREV64.S32 Q6,Q6                    @Rev 9-12 R2
1453*c83a76b0SSuyog Pawar    VREV64.S32 Q7,Q7                    @Rev 12-16 R2
1454*c83a76b0SSuyog Pawar
1455*c83a76b0SSuyog Pawar    VSWP D6,D7
1456*c83a76b0SSuyog Pawar    VSWP D4,D5
1457*c83a76b0SSuyog Pawar    VADD.S32 Q8 ,Q0,Q3                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R1
1458*c83a76b0SSuyog Pawar    VSWP D12,D13                        @ dual issued with prev. instruction
1459*c83a76b0SSuyog Pawar    VADD.S32 Q9 ,Q1,Q2                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R1
1460*c83a76b0SSuyog Pawar    VSWP D14,D15                        @ dual issued with prev. instruction
1461*c83a76b0SSuyog Pawar    VSUB.S32 Q10,Q0,Q3                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R1
1462*c83a76b0SSuyog Pawar    VSUB.S32 Q11,Q1,Q2                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R1
1463*c83a76b0SSuyog Pawar
1464*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q4,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R2
1465*c83a76b0SSuyog Pawar    VREV64.S32    Q9 ,Q9                @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
1466*c83a76b0SSuyog Pawar    VADD.S32 Q13,Q5,Q6                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R2
1467*c83a76b0SSuyog Pawar    VSUB.S32 Q0 ,Q4,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R2
1468*c83a76b0SSuyog Pawar    VSWP D18,D19                        @ dual issued with prev. instruction
1469*c83a76b0SSuyog Pawar    VSUB.S32 Q1 ,Q5,Q6                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R2
1470*c83a76b0SSuyog Pawar    VREV64.S32    Q13,Q13               @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
1471*c83a76b0SSuyog Pawar
1472*c83a76b0SSuyog Pawar    VADD.S32 Q2,Q8,Q9                   @ee[k] = e[k] + e[7 - k] row R1
1473*c83a76b0SSuyog Pawar    VSUB.S32 Q3,Q8,Q9                   @eo[k] = e[k] - e[7 - k] row R1
1474*c83a76b0SSuyog Pawar    VSWP D26,D27
1475*c83a76b0SSuyog Pawar
1476*c83a76b0SSuyog Pawar
1477*c83a76b0SSuyog Pawar    VADD.S32 Q4,Q12,Q13                 @ee[k] = e[k] + e[7 - k] row R2
1478*c83a76b0SSuyog Pawar    VSUB.S32 Q5,Q12,Q13                 @eo[k] = e[k] - e[7 - k] row R2
1479*c83a76b0SSuyog Pawar    VREV64.S32 D5,D5                    @rev ee[k] 4-7 R1, dual issued with prev. instruction
1480*c83a76b0SSuyog Pawar
1481*c83a76b0SSuyog Pawar    VADD.S32 D12,D4,D5                  @eee[0] eee[1]    R1
1482*c83a76b0SSuyog Pawar    VSUB.S32 D13,D4,D5                  @eeo[0] eeo[1]    R1
1483*c83a76b0SSuyog Pawar    VREV64.S32 D9,D9                    @rev ee[k] 4-7 R2, dual issued with prev. instruction
1484*c83a76b0SSuyog Pawar
1485*c83a76b0SSuyog Pawar
1486*c83a76b0SSuyog Pawar    VADD.S32 D14,D8,D9                  @eee[0] eee[1]    R2
1487*c83a76b0SSuyog Pawar    VSUB.S32 D15,D8,D9                  @eeo[0] eeo[1]    R2
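@ Scalar view of the two VADD/VSUB pairs above (even-even split, per row):
@     eee[0] = ee[0] + ee[3];    eee[1] = ee[1] + ee[2];
@     eeo[0] = ee[0] - ee[3];    eeo[1] = ee[1] - ee[2];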
1488*c83a76b0SSuyog Pawar
1489*c83a76b0SSuyog Pawar    VLD1.S32 {Q12,Q13},[SP]             @Load g_ai2_ihevc_trans_16[xx]->  Q12 : [0 0] [8 0] [4 0] [12 0]  Q13 : [0 1] [8 1] [4 1] [12 1]
1490*c83a76b0SSuyog Pawar    VREV64.S32 Q8,Q6                    @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1   ->     ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
1491*c83a76b0SSuyog Pawar
1492*c83a76b0SSuyog Pawar    VREV64.S32 Q9,Q7                    @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2     ->    ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
1493*c83a76b0SSuyog Pawar
1494*c83a76b0SSuyog Pawar
1495*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q6,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R1
1496*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q8,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0]    R1
1497*c83a76b0SSuyog Pawar
1498*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q7,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R2
1499*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q9,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
1500*c83a76b0SSuyog Pawar
1501*c83a76b0SSuyog Pawar                                        @Q3    :R1E00 R1E01 R1E02 R1E03
1502*c83a76b0SSuyog Pawar                                        @Q5    :R2E00 R2E01 R2E02 R2E03
1503*c83a76b0SSuyog Pawar    VSWP D7,D10                         @ dual issued with prev. instruction
1504*c83a76b0SSuyog Pawar                                        @Q3    :R1E00 R1E01 R2E00 R2E01
1505*c83a76b0SSuyog Pawar                                        @Q5    :R1E02 R1E03 R2E02 R2E03
1506*c83a76b0SSuyog Pawar    VSWP D7,D11
1507*c83a76b0SSuyog Pawar                                        @Q3    :R1E00 R1E01 R2E02 R2E03
1508*c83a76b0SSuyog Pawar                                        @Q5    :R1E02 R1E03 R2E00 R2E01
1509*c83a76b0SSuyog Pawar
1510*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_W
1511*c83a76b0SSuyog Pawar    ADD R11,R9,R12,LSL #1               @Get to the 2nd row of src
1512*c83a76b0SSuyog Pawar    LSL R12,R12,#2
1513*c83a76b0SSuyog Pawar
1514*c83a76b0SSuyog Pawar    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
1515*c83a76b0SSuyog Pawar
1516*c83a76b0SSuyog Pawar    VADD.S32  Q4,Q4,Q14                 @ROUND  R1
1517*c83a76b0SSuyog Pawar    VMUL.S32  Q12,Q3,Q7                 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
1518*c83a76b0SSuyog Pawar    VSWP      D14,D15                   @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
1519*c83a76b0SSuyog Pawar
1520*c83a76b0SSuyog Pawar    VADD.S32 Q6,Q6,Q14                  @ROUND  R2
1521*c83a76b0SSuyog Pawar
1522*c83a76b0SSuyog Pawar    VSHRN.S32 D8,Q4,#SHIFT              @NARROW R1
1523*c83a76b0SSuyog Pawar
1524*c83a76b0SSuyog Pawar    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[6][0-4]
1525*c83a76b0SSuyog Pawar    VSHRN.S32 D9,Q6,#SHIFT              @NARROW R2, dual issued in 2nd cycle
1526*c83a76b0SSuyog Pawar
1527*c83a76b0SSuyog Pawar    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
1528*c83a76b0SSuyog Pawar    VSWP      D16,D17                   @dual issued with prev. instr.
1529*c83a76b0SSuyog Pawar
1530*c83a76b0SSuyog Pawar    VZIP.S16 D8,D9                      @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
1531*c83a76b0SSuyog Pawar    VMLA.S32  Q12,Q5,Q7                 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
1532*c83a76b0SSuyog Pawar
1533*c83a76b0SSuyog Pawar
1534*c83a76b0SSuyog Pawar    @WRITE INTO MEM the values or wait to be shuffled
1535*c83a76b0SSuyog Pawar    @These values must go to 0 4 8 12 columns
1536*c83a76b0SSuyog Pawar    LSL R10,R7,#2
1537*c83a76b0SSuyog Pawar    VST1.S32 D8[0],[R2],R10
1538*c83a76b0SSuyog Pawar
1539*c83a76b0SSuyog Pawar    VST1.S32 D9[0],[R2],R10
1540*c83a76b0SSuyog Pawar
1541*c83a76b0SSuyog Pawar    VST1.S32 D8[1],[R2],R10
1542*c83a76b0SSuyog Pawar    VPADD.S32 D18,D24,D25               @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
1543*c83a76b0SSuyog Pawar                                        @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01
1544*c83a76b0SSuyog Pawar
1545*c83a76b0SSuyog Pawar    VST1.S32 D9[1],[R2],R10
1546*c83a76b0SSuyog Pawar    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]
1547*c83a76b0SSuyog Pawar    LSL R10,R10,#2
1548*c83a76b0SSuyog Pawar    SUB R2,R2,R10
1549*c83a76b0SSuyog Pawar
1550*c83a76b0SSuyog Pawar    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[10][0-4]
1551*c83a76b0SSuyog Pawar
1552*c83a76b0SSuyog Pawar    VMUL.S32  Q6,Q3,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1553*c83a76b0SSuyog Pawar    VSWP      D14,D15                   @ dual issued with prev. instruction
1554*c83a76b0SSuyog Pawar    VPADD.S32 D19,D4,D5
1555*c83a76b0SSuyog Pawar
1556*c83a76b0SSuyog Pawar    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[14][0-4]
1557*c83a76b0SSuyog Pawar    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1558*c83a76b0SSuyog Pawar    VSWP      D16,D17
1559*c83a76b0SSuyog Pawar
1560*c83a76b0SSuyog Pawar    VMLA.S32  Q6,Q5,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1561*c83a76b0SSuyog Pawar    VADD.S32 Q9,Q9,Q14                  @Round by RADD R1
1562*c83a76b0SSuyog Pawar    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1563*c83a76b0SSuyog Pawar    VSHRN.S32 D8,Q9,#SHIFT              @Shift by SHIFT
1564*c83a76b0SSuyog Pawar    VPADD.S32 D24,D12,D13
1565*c83a76b0SSuyog Pawar    @---------------Processing O, Row 1 and Row 2--------------------------------------
1566*c83a76b0SSuyog Pawar    @ Early start to avoid stalls
1567*c83a76b0SSuyog Pawar    MOV R12,#COFF_STD_W
1568*c83a76b0SSuyog Pawar    ADD R11,R9,R12                      @Get 1ST row
1569*c83a76b0SSuyog Pawar    LSL R12,R12,#1
1570*c83a76b0SSuyog Pawar
1571*c83a76b0SSuyog Pawar    LSL R10,R7,#2
1572*c83a76b0SSuyog Pawar    ADD R2,R2,R7,LSL #1                 @move to third row
1573*c83a76b0SSuyog Pawar    @this should go to 2  6 10 14
1574*c83a76b0SSuyog Pawar    VST1.S32 D8[0],[R2],R10
1575*c83a76b0SSuyog Pawar
1576*c83a76b0SSuyog Pawar    VST1.S32 D8[1],[R2],R10
1577*c83a76b0SSuyog Pawar    VPADD.S32 D25,D4,D5                 @ dual issued with prev. instruction in 2nd cycle
1578*c83a76b0SSuyog Pawar
1579*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
1580*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q14                @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
1581*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q2,Q0                   @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2
1582*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q3,Q1                   @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2
1583*c83a76b0SSuyog Pawar    VSHRN.S32 D9,Q12,#SHIFT             @Shift by SHIFT
1584*c83a76b0SSuyog Pawar
1585*c83a76b0SSuyog Pawar    VMUL.S32 Q2,Q2,Q10                  @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1
1586*c83a76b0SSuyog Pawar    VMLA.S32 Q2,Q3,Q11                  @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1
1587*c83a76b0SSuyog Pawar    VADD.S32 D11,D12,D13                @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
1588*c83a76b0SSuyog Pawar    VST1.S32 D9[0],[R2],R10
1589*c83a76b0SSuyog Pawar
1590*c83a76b0SSuyog Pawar    VST1.S32 D9[1],[R2],R10
1591*c83a76b0SSuyog Pawar    VADD.S32 D10,D4,D5                  @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
1592*c83a76b0SSuyog Pawar    LSL R10,R10,#2                      @go back to origin
1593*c83a76b0SSuyog Pawar    SUB R2,R2,R10
1594*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #1
1595*c83a76b0SSuyog Pawar
1596*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1597*c83a76b0SSuyog Pawar
1598*c83a76b0SSuyog Pawar    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1599*c83a76b0SSuyog Pawar    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1600*c83a76b0SSuyog Pawar    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1601*c83a76b0SSuyog Pawar    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1602*c83a76b0SSuyog Pawar
1603*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1604*c83a76b0SSuyog Pawar    VADD.S32 D18,D14,D15
1605*c83a76b0SSuyog Pawar    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1606*c83a76b0SSuyog Pawar    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1607*c83a76b0SSuyog Pawar    VADD.S32 D19,D16,D17
1608*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1609*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1610*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1611*c83a76b0SSuyog Pawar    VADD.S32 D26,D24,D25                @ dual issued with prev. instr.
1612*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1613*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1614*c83a76b0SSuyog Pawar    VADD.S32 D27,D8,D9
1615*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1616*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1617*c83a76b0SSuyog Pawar    VADD.S32 D12,D12,D13
1618*c83a76b0SSuyog Pawar    @Q5 Q9 Q13 Q6
1619*c83a76b0SSuyog Pawar    VPADD.S32 D14,D10,D11
1620*c83a76b0SSuyog Pawar    VPADD.S32 D15,D18,D19
1621*c83a76b0SSuyog Pawar    VPADD.S32 D16,D26,D27
1622*c83a76b0SSuyog Pawar    VADD.S32  D13,D8,D9
1623*c83a76b0SSuyog Pawar    VADD.S32 Q9,Q7,Q14
1624*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1625*c83a76b0SSuyog Pawar    VPADD.S32 D17,D12,D13               @ dual issued with prev. instr. in 2nd cycle
1626*c83a76b0SSuyog Pawar
1627*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q10                  @o[0][0-3]
1628*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q11                  @o[0][4-7]
1629*c83a76b0SSuyog Pawar
1630*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q8,Q14
1631*c83a76b0SSuyog Pawar
1632*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q2,Q0                   @o[0][0-3]
1633*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q3,Q1                   @o[0][4-7]
1634*c83a76b0SSuyog Pawar
1635*c83a76b0SSuyog Pawar    VSHRN.S32 D26,Q9,#SHIFT
1636*c83a76b0SSuyog Pawar    VSHRN.S32 D27,Q12,#SHIFT
1637*c83a76b0SSuyog Pawar    VADD.S32 D10,D8,D9
1638*c83a76b0SSuyog Pawar    @write to memory this should go to 1 3 5 7
1639*c83a76b0SSuyog Pawar    ADD R2,R2,R7
1640*c83a76b0SSuyog Pawar    LSL R7,R7,#1
1641*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1642*c83a76b0SSuyog Pawar    VADD.S32 D11,D12,D13                @ dual issued with prev. instr.
1643*c83a76b0SSuyog Pawar
1644*c83a76b0SSuyog Pawar    VST1.S32 D26[0],[R2],R7
1645*c83a76b0SSuyog Pawar    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1646*c83a76b0SSuyog Pawar    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1647*c83a76b0SSuyog Pawar    VST1.S32 D26[1],[R2],R7
1648*c83a76b0SSuyog Pawar    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1649*c83a76b0SSuyog Pawar    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1650*c83a76b0SSuyog Pawar    VST1.S32 D27[0],[R2],R7
1651*c83a76b0SSuyog Pawar    VADD.S32 D18,D14,D15
1652*c83a76b0SSuyog Pawar    VST1.S32 D27[1],[R2],R7
1653*c83a76b0SSuyog Pawar
1654*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1655*c83a76b0SSuyog Pawar    VADD.S32 D19,D16,D17                @ dual issued with prev. instr.
1656*c83a76b0SSuyog Pawar
1657*c83a76b0SSuyog Pawar    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1658*c83a76b0SSuyog Pawar    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1659*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1660*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1661*c83a76b0SSuyog Pawar
1662*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1663*c83a76b0SSuyog Pawar    VADD.S32 D26,D24,D25
1664*c83a76b0SSuyog Pawar
1665*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1666*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1667*c83a76b0SSuyog Pawar    VADD.S32  D27,D8,D9
1668*c83a76b0SSuyog Pawar
1669*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1670*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1671*c83a76b0SSuyog Pawar    VADD.S32 D12,D12,D13
1672*c83a76b0SSuyog Pawar    @Q5 Q9 Q13 Q6
1673*c83a76b0SSuyog Pawar    VPADD.S32 D14,D10,D11
1674*c83a76b0SSuyog Pawar    VPADD.S32 D15,D18,D19
1675*c83a76b0SSuyog Pawar    VPADD.S32 D16,D26,D27
1676*c83a76b0SSuyog Pawar    VADD.S32  D13,D8,D9
1677*c83a76b0SSuyog Pawar    VADD.S32 Q9,Q7,Q14
1678*c83a76b0SSuyog Pawar    @ 1- cycle stall?
1679*c83a76b0SSuyog Pawar    VPADD.S32 D17,D12,D13
1680*c83a76b0SSuyog Pawar    VSHRN.S32 D22,Q9,#SHIFT
1681*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q8,Q14
1682*c83a76b0SSuyog Pawar    @ 2-cycle stall?
1683*c83a76b0SSuyog Pawar    VSHRN.S32 D23,Q10,#SHIFT
1684*c83a76b0SSuyog Pawar
1685*c83a76b0SSuyog Pawar    @this should go to 9 11 13 15
1686*c83a76b0SSuyog Pawar    @LSL R11,R7,#1
1687*c83a76b0SSuyog Pawar    VST1.S32 D22[0],[R2],R7
1688*c83a76b0SSuyog Pawar    VST1.S32 D22[1],[R2],R7
1689*c83a76b0SSuyog Pawar    VST1.S32 D23[0],[R2],R7
1690*c83a76b0SSuyog Pawar    VST1.S32 D23[1],[R2],R7
1691*c83a76b0SSuyog Pawar
1692*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #3
1693*c83a76b0SSuyog Pawar    LSR R7,R7,#1
1694*c83a76b0SSuyog Pawar    SUB R2,R2,R7
1695*c83a76b0SSuyog Pawar
1696*c83a76b0SSuyog Pawar    ADD R2,R2,#4                        @Move to the next column
1697*c83a76b0SSuyog Pawar
1698*c83a76b0SSuyog Pawar    ADD R8,R8,#2                        @increment loop cntr by 2 since we process 2 columns per iteration
1699*c83a76b0SSuyog Pawar    CMP R8,#16                          @check loop cntr
1700*c83a76b0SSuyog Pawar    BNE CORE_LOOP_16X16_VERT            @jump acc
1701*c83a76b0SSuyog Pawar
1702*c83a76b0SSuyog Pawar    MOV R0,R3                           @Return the block SAD (accumulated above) in R0
1703*c83a76b0SSuyog Pawar
1704*c83a76b0SSuyog Pawar    ADD SP,SP,#32
1705*c83a76b0SSuyog Pawar    vpop {d8 - d15}
1706*c83a76b0SSuyog Pawar    LDMFD          sp!,{r4-r12,PC}      @restore the saved registers and return
1707*c83a76b0SSuyog Pawar
1708