@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.align 4

@/**
@/*******************************************************************************
@/*
@/* @brief
@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
@/*
@/* @par Description:
@/*  Performs residue calculation by subtracting source and prediction and
@/*  followed by forward transform
@/*
@/* @param[in] pu1_src
@/*  Input 4x4 pixels
@/*
@/* @param[in] pu1_pred
@/*  Prediction data
@/*
@/* @param[in] pi4_tmp
@/*  Temporary buffer of size 4x4
@/*
@/* @param[out] pi2_dst
@/*  Output 4x4 coefficients
@/*
@/* @param[in] src_strd
@/*  Input stride
@/*
@/* @param[in] pred_strd
@/*  Prediction Stride
@/*
@/* @param[in] dst_strd
@/*  Output Stride
@/*
@/* @param[in] chr_plane
@/*  Chroma plane
@/*
@/* @returns Void
@/*
@/* @remarks
@/*  None
@/*
@/*******************************************************************************
@/*/

@/**************Variables Vs Registers*****************************************
@    r0 => *pu1_src
@    r1 => *pu1_pred
@    r2 => *pi4_temp
@    r3 => *pi2_dst
@    r4 => src_strd
@    r5 => pred_strd
@    r6 => dst_strd
@    r7 => chroma_plane

    .global ihevc_resi_trans_4x4_a9q

ihevc_resi_trans_4x4_a9q:

    STMFD sp!, {r4-r7, r14}     @ store all the register components from caller function to memory
    LDR r4, [sp,#20]            @ r4 contains src_strd
    LDR r5, [sp,#24]            @ r5 contains pred_strd
    LDR r6, [sp,#28]            @ r6 contains dst_strd
    LDR r7, [sp,#32]            @ r7 chroma plane

    CMP r7, #-1
    BEQ NON_INTERLEAVE_LOAD     @ if flag == NULL_PLANE, use non-interleaving loads

    VLD1.64 d0, [r0], r4        @ load row 0 src
    VLD1.64 d4, [r0], r4        @ load row 1 src
    VLD1.64 d1, [r0], r4        @ load row 2 src
    VLD1.64 d5, [r0], r4        @ load row 3 src
    VUZP.8 d0, d4               @ de-interleaving unzip instruction to get luma data of pu1_src in d0
    VUZP.8 d1, d5               @ de-interleaving unzip instruction to get luma data of pu1_src in d1

    VLD1.64 d2, [r1], r5        @ load row 0 pred
    VLD1.64 d6, [r1], r5        @ load row 1 pred
    VLD1.64 d3, [r1], r5        @ load row 2 pred
    VLD1.64 d7, [r1], r5        @ load row 3 pred
    VUZP.8 d2, d6               @ de-interleaving unzip instruction to get luma data of pu1_pred in d2
    VUZP.8 d3, d7               @ de-interleaving unzip instruction to get luma data of pu1_pred in d3

    CMP r7, #0
    BEQ LOAD_END
    VSWP.8 d0, d4
    VSWP.8 d1, d5
    VSWP.8 d2, d6
    VSWP.8 d3, d7

    B LOAD_END

NON_INTERLEAVE_LOAD:
    VLD1.U32 d0[0], [r0], r4    @ load row 0 src
    VLD1.U32 d0[1], [r0], r4    @ load row 1 src
    VLD1.U32 d1[0], [r0], r4    @ load row 2 src
    VLD1.U32 d1[1], [r0], r4    @ load row 3 src

    VLD1.U32 d2[0], [r1], r5    @ load row 0 pred
    VLD1.U32 d2[1], [r1], r5    @ load row 1 pred
    VLD1.U32 d3[0], [r1], r5    @ load row 2 pred
    VLD1.U32 d3[1], [r1], r5    @ load row 3 pred

LOAD_END:
    @ Finding the residue
    VSUBL.U8 q2, d0, d2         @ q2 contains 1st 16-bit 8 residues
    VSUBL.U8 q3, d1, d3         @ q3 contains 2nd 16-bit 8 residues

    @ SAD calculation
    VABDL.U8 q12, d0, d2        @ q12 contains absolute differences
    VABAL.U8 q12, d1, d3        @ q12 accumulates absolute differences
    VADD.U16 d26, d24, d25      @ add d-registers of q12
    VPADDL.U16 d27, d26         @ d27 contains 2 32-bit values that have to be added
    VPADDL.U32 d28, d27         @ d28 contains 64-bit SAD, only LSB important
    VMOV.32 r0, d28[0]          @ SAD stored in r0 for return
    @ SAD calculation ends

    @ Forward transform - step 1
    VMOV.I16 d2, #64            @ generate immediate constant in d2 for even row multiplication
    VTRN.16 d4, d5              @ 3-step transpose of residue matrix starts
    VTRN.16 d6, d7              @ 2nd step of the 3-step matrix transpose
    VMOV.I16 d0, #83            @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q2, q3              @ Final step of matrix transpose

    VMOV.I16 d1, #36            @ generate immediate constant in d1 for odd row multiplication
    VSWP d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
    VADD.S16 q10, q2, q3        @ q10 has the even array
    VSUB.S16 q11, q2, q3        @ q11 has the odd array
    VMULL.S16 q12, d20, d2      @ e[0]*64
    VMLAL.S16 q12, d21, d2[0]   @ row 1 of results: e[0]*64 + e[1]*64
    VMULL.S16 q13, d20, d2      @ e[0]*64
    VMLSL.S16 q13, d21, d2[0]   @ row 3 of results: e[0]*64 - e[1]*64
    VMULL.S16 q8, d22, d0       @ o[0]*83
    VMLAL.S16 q8, d23, d1[0]    @ row 2 of results: o[0]*83 + o[1]*36
    VMULL.S16 q9, d22, d1       @ o[0]*36
    VMLSL.S16 q9, d23, d0[0]    @ row 4 of results: o[0]*36 - o[1]*83

    @ Forward transform - step 2
    VMOV.I32 d2, #64            @ generate immediate constant in d2 for even row multiplication
    VMOV.I32 d0, #83            @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q12, q8             @ 4-step transpose of residue matrix starts
    VTRN.32 q13, q9             @ 2nd step of the 4-step matrix transpose

    VMOV.I32 d1, #36            @ generate immediate constant in d1 for odd row multiplication
    VSWP d25, d26               @ 3rd step of the 4-step matrix transpose
    VSWP d17, d18               @ 4th step of the 4-step matrix transpose
    VADD.S32 q2, q12, q9        @ e[0]
    VADD.S32 q3, q8, q13        @ e[1]
    VSUB.S32 q10, q12, q9       @ o[0]
    VSUB.S32 q11, q8, q13       @ o[1]

    VMUL.S32 q12, q2, d2[0]     @ e[0]*64
    VMLA.S32 q12, q3, d2[0]     @ row 1 of results: e[0]*64 + e[1]*64
    VMUL.S32 q13, q2, d2[0]     @ e[0]*64
    VMLS.S32 q13, q3, d2[0]     @ row 3 of results: e[0]*64 - e[1]*64
    VMUL.S32 q8, q10, d0[0]     @ o[0]*83
    VMLA.S32 q8, q11, d1[0]     @ row 2 of results: o[0]*83 + o[1]*36
    VMUL.S32 q9, q10, d1[0]     @ o[0]*36
    VMLS.S32 q9, q11, d0[0]     @ row 4 of results: o[0]*36 - o[1]*83

    VRSHRN.S32 d0, q12, #9      @ (row1 + 256)/512
    VRSHRN.S32 d1, q8, #9       @ (row2 + 256)/512
    VRSHRN.S32 d2, q13, #9      @ (row3 + 256)/512
    VRSHRN.S32 d3, q9, #9       @ (row4 + 256)/512

    LSL r7, r6, #1              @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
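
    @ Note: step 1 above keeps its results as 32-bit values instead of narrowing
    @ between the two passes, so the VRSHRN #9 above applies the usual two-stage
    @ scaling of the 4x4 forward transform (a stage-1 shift of 1 plus a stage-2
    @ shift of 8 for 8-bit input) as a single rounded shift, i.e. (value + 256) >> 9.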
    VST1.U16 d0, [r3], r7       @ store 1st row of result
    VST1.U16 d1, [r3], r7       @ store 2nd row of result
    VST1.U16 d2, [r3], r7       @ store 3rd row of result
    VST1.U16 d3, [r3], r7       @ store 4th row of result

    LDMFD sp!,{r4-r7,r15}       @ Reload the registers from SP

    @ Function End

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform type 1
@*  on input pixels
@*
@* @description
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by forward transform
@*
@* @param[in] pu1_src
@*  Input 4x4 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 4x4
@*
@* @param[out] pi2_dst
@*  Output 4x4 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd
@*  Output Stride
@*
@* @param[in] chr_plane (unused)
@*  Chroma plane
@*
@* @returns  void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
@                                     UWORD8 *pu1_pred,
@                                     WORD32 *pi4_temp,
@                                     WORD16 *pi2_dst,
@                                     WORD32 src_strd,
@                                     WORD32 pred_strd,
@                                     WORD32 dst_strd,
@                                     WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]    - src_strd
@ [sp+4]  - pred_strd
@ [sp+8]  - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************

    .global ihevc_resi_trans_4x4_ttype1_a9q

ihevc_resi_trans_4x4_ttype1_a9q:
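
    @ ttype1 is the 4x4 DST-style transform HEVC uses for intra 4x4 luma blocks;
    @ its forward matrix is built from the constants {29, 55, 74, 84}:
    @     [ 29  55  74  84 ]
    @     [ 74  74   0 -74 ]
    @     [ 84 -29 -74  55 ]
    @     [ 55 -84  74 -29 ]
    @ The multiply-accumulate sequences below apply these rows to the residue
    @ columns in both passes.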

    PUSH {r4}
    vpush {d8 - d15}

    LDR r2,[sp,#68]             @ r2 = src_strd
    LDR r4,[sp,#72]             @ r4 = pred_strd

    VLD1.32 d2[0],[r0],r2       @ Row 1 of source in d2[0]
    VLD1.32 d3[0],[r1],r4       @ Row 1 of prediction in d3[0]
    VLD1.32 d2[1],[r0],r2       @ Row 2 of source in d2[1]
    VLD1.32 d3[1],[r1],r4       @ Row 2 of prediction in d3[1]

    VLD1.32 d8[0],[r0],r2       @ Row 3 of source in d8[0]
    VABDL.U8 q0,d2,d3           @ Absolute differences of rows 1 and 2 in d0
                                @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
    VLD1.32 d9[0],[r1],r4       @ Row 3 of prediction in d9[0]
    VSUBL.U8 q5,d2,d3           @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
    VLD1.32 d8[1],[r0]          @ Row 4 of source in d8[1]
    VTRN.16 d10,d11             @ Transpose step 1
    VLD1.32 d9[1],[r1]          @ Row 4 of prediction in d9[1]

    VSUBL.U8 q6,d8,d9           @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
                                @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
    VABAL.U8 q0,d8,d9           @ Absolute differences of rows 3 and 4 in d1
    VTRN.16 d12,d13             @ Transpose step 2
    VTRN.32 q5,q6               @ Transpose step 3, Residue block transposed
                                @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
    VADD.S16 d23,d11,d13        @ d23 = C2 + C4
    VMOV.I32 d6,#55             @ Constant used for multiplication
    VADD.S16 d22,d10,d13        @ d22 = C1 + C4
    VADD.U16 d0,d1,d0           @ Accumulating SAD step 1
    VMOV.I32 d7,#84             @ Constant used for multiplication
    VMULL.S16 q7,d23,d6[0]      @ q7 = 55*C2 + 55*C4
    VMOV.I32 d4,#74             @ Constant used for multiplication
    VMULL.S16 q9,d22,d7[0]      @ q9 = 84*C1 + 84*C4
    VADD.S16 d16,d10,d11        @ d16 = C1 + C2
    VMUL.S16 d12,d12,d4[0]      @ d12 = 74*C3
    VMOV.I32 d5,#29             @ Constant used for multiplication
    VPADDL.U16 d0,d0            @ Accumulating SAD step 2
    VSUB.S16 d16,d16,d13        @ d16 = C1 + C2 - C4
    VMLAL.S16 q7,d22,d5[0]      @ q7 = 29*C1 + 55*C2 + 84*C4
    VMLSL.S16 q9,d23,d5[0]      @ q9 = 84*C1 - 29*C2 + 55*C4
    VMULL.S16 q8,d16,d4[0]      @ q8 = 74*C1 + 74*C2 - 74*C4
    VPADDL.U32 d0,d0            @ Accumulating SAD step 3, SAD in d0
    VSUB.S32 q10,q9,q7          @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
    VMOV.32 r0,d0[0]            @ Return SAD value
    VRSHR.S32 q8,q8,#1          @ Truncating the 1 bit in q8

    VADDW.S16 q7,q7,d12         @ q7 = 29*C1 + 55*C2 + 74*C3 + 84*C4
    VSUBW.S16 q9,q9,d12         @ q9 = 84*C1 - 29*C2 - 74*C3 + 55*C4
    VADDW.S16 q10,q10,d12       @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4

    VRSHR.S32 q7,q7,#1          @ Truncating the 1 bit in q7
    VRSHR.S32 q9,q9,#1          @ Truncating the 1 bit in q9
    VRSHR.S32 q10,q10,#1        @ Truncating the 1 bit in q10
                                @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
    VTRN.32 q7,q8
    VTRN.32 q9,q10
    VSWP d15,d18
    VSWP d17,d20                @ Residue block transposed
                                @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
    VADD.S32 q13,q7,q8          @ q13 = S1 + S2
    VADD.S32 q1,q7,q10          @ q1 = S1 + S4
    VADD.S32 q4,q8,q10          @ q4 = S2 + S4
    VSUB.S32 q13,q13,q10        @ q13 = S1 + S2 - S4
    VMUL.S32 q12,q1,d5[0]       @ q12 = 29*S1 + 29*S4
    VMUL.S32 q14,q1,d7[0]       @ q14 = 84*S1 + 84*S4
    VMUL.S32 q13,q13,d4[0]      @ q13 = 74*S1 + 74*S2 - 74*S4

    VMLA.S32 q12,q4,d6[0]       @ q12 = 29*S1 + 55*S2 + 84*S4
    VMLS.S32 q14,q4,d5[0]       @ q14 = 84*S1 - 29*S2 + 55*S4
    VMUL.S32 q9,q9,d4[0]        @ q9 = 74*S3

    LDR r4,[sp,#76]             @ r4 = dst_strd_chr_flag
    LSL r4,r4,#1                @ r4 = 2*dst_strd

    VRSHRN.S32 d26,q13,#8
    VSUB.S32 q15,q14,q12        @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4

    VADD.S32 q12,q12,q9         @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
    VSUB.S32 q14,q14,q9         @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
    VADD.S32 q15,q15,q9         @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4

    VRSHRN.S32 d24,q12,#8
    VRSHRN.S32 d28,q14,#8
    VRSHRN.S32 d30,q15,#8       @ Truncating the last 8 bits
                                @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
    VST1.64 d24,[r3],r4         @ Storing row 1 of transform stage 2
    VST1.64 d26,[r3],r4         @ Storing row 2 of transform stage 2
    VST1.64 d28,[r3],r4         @ Storing row 3 of transform stage 2
    VST1.64 d30,[r3]            @ Storing row 4 of transform stage 2

    vpop {d8 - d15}
    POP {r4}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and DCT integer forward transform
@*  on 8x8 block
@*
@* @description
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by DCT integer forward transform
@*
@* @param[in] pu1_src
@*  Input 8x8 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 8x8
@*
@* @param[out] pi2_dst
@*  Output 8x8 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd
@*  Output Stride
@*
@* @param[in] chr_plane
@*  Chroma plane
@*
@* @returns  void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@                              UWORD8 *pu1_pred,
@                              WORD32 *pi4_temp,
@                              WORD16 *pi2_dst,
@                              WORD32 src_strd,
@                              WORD32 pred_strd,
@                              WORD32 dst_strd,
@                              WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]    - src_strd
@ [sp+4]  - pred_strd
@ [sp+8]  - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************

    .global ihevc_resi_trans_8x8_a9q

ihevc_resi_trans_8x8_a9q:

    PUSH {r4,r5}
    vpush {d8 - d15}

    @ Loading Prediction and Source blocks of size 8x8

    LDR r4,[sp,#84]             @ r4 = chroma flag

    CMP r4,#-1                  @ NULL PLANE
    BEQ LUMA_LOAD

    CMP r4,#1                   @ V PLANE
    BEQ CHROMA_V_LOAD
    @ handling U PLANE
    LDR r5,[sp,#72]             @ r5 = src_strd
    LDR r4,[sp,#76]             @ r4 = pred_strd

    VLD2.8 {d0,d2},[r1],r4      @ Row 1 of prediction in d0
    VLD2.8 {d1,d3},[r0],r5      @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0          @ Row 1 of absolute difference in q15
    VLD2.8 {d2,d4},[r1],r4      @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0           @ Row 1 of residue in q0
    VLD2.8 {d3,d5},[r0],r5      @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2           @ Row 2 of absolute difference in q9
    VLD2.8 {d4,d6},[r1],r4      @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2           @ Row 2 of residue in q1
    VLD2.8 {d5,d7},[r0],r5      @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4          @ Row 3 of absolute difference accumulated in q15
    VLD2.8 {d6,d8},[r1],r4      @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4           @ Row 3 of residue in q2
    VLD2.8 {d7,d9},[r0],r5      @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6           @ Row 4 of absolute difference accumulated in q9
    VLD2.8 {d8,d10},[r1],r4     @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6           @ Row 4 of residue in q3
    VLD2.8 {d9,d11},[r0],r5     @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8          @ Row 5 of absolute difference in q10
    VLD2.8 {d10,d12},[r1],r4    @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8           @ Row 5 of residue in q4
    VLD2.8 {d11,d13},[r0],r5    @ Row 6 of source in d11

    VABAL.U8 q15,d11,d10        @ Row 6 of absolute difference accumulated in q15
    VLD2.8 {d12,d14},[r1],r4    @ Row 7 of prediction in d12
    VSUBL.U8 q5,d11,d10         @ Row 6 of residue in q5
    VLD2.8 {d13,d15},[r0],r5    @ Row 7 of source in d13

    VABAL.U8 q9,d13,d12         @ Row 7 of absolute difference accumulated in q9
    VLD2.8 {d14,d16},[r1]       @ Row 8 of prediction in d14
    VSUBL.U8 q6,d13,d12         @ Row 7 of residue in q6
    VLD2.8 {d15,d17},[r0]       @ Row 8 of source in d15

    B LUMA_LOAD_END

CHROMA_V_LOAD:
    LDR r5,[sp,#72]             @ r5 = src_strd
    LDR r4,[sp,#76]             @ r4 = pred_strd

    VLD2.8 {d0,d2},[r1],r4      @ Row 1 of prediction in d2
    VLD2.8 {d1,d3},[r0],r5      @ Row 1 of source in d3

    VABDL.U8 q15,d3,d2          @ Row 1 of absolute difference in q15
    VLD2.8 {d4,d6},[r1],r4      @ Row 2 of prediction in d6
    VSUBL.U8 q0,d3,d2           @ Row 1 of residue in q0
    VLD2.8 {d5,d7},[r0],r5      @ Row 2 of source in d7

    VABDL.U8 q9,d7,d6           @ Row 2 of absolute difference in q9
    VLD2.8 {d8,d10},[r1],r4     @ Row 3 of prediction in d10
    VSUBL.U8 q1,d7,d6           @ Row 2 of residue in q1
    VLD2.8 {d9,d11},[r0],r5     @ Row 3 of source in d11

    VABAL.U8 q15,d11,d10        @ Row 3 of absolute difference accumulated in q15
    VLD2.8 {d6,d8},[r1],r4      @ Row 4 of prediction in d8
    VSUBL.U8 q2,d11,d10         @ Row 3 of residue in q2
    VLD2.8 {d7,d9},[r0],r5      @ Row 4 of source in d9

    VABAL.U8 q9,d9,d8           @ Row 4 of absolute difference accumulated in q9
    VLD2.8 {d10,d12},[r1],r4    @ Row 5 of prediction in d12
    VSUBL.U8 q3,d9,d8           @ Row 4 of residue in q3
    VLD2.8 {d11,d13},[r0],r5    @ Row 5 of source in d13

    VABDL.U8 q10,d13,d12        @ Row 5 of absolute difference in q10
    VLD2.8 {d14,d16},[r1],r4    @ Row 6 of prediction in d16
    VSUBL.U8 q4,d13,d12         @ Row 5 of residue in q4
    VLD2.8 {d15,d17},[r0],r5    @ Row 6 of source in d17

    VABAL.U8 q15,d17,d16        @ Row 6 of absolute difference accumulated in q15
    VLD2.8 {d12,d14},[r1],r4    @ Row 7 of prediction in d12
    VSUBL.U8 q5,d17,d16         @ Row 6 of residue in q5
    VLD2.8 {d13,d15},[r0],r5    @ Row 7 of source in d13

    VABAL.U8 q9,d15,d14         @ Row 7 of absolute difference accumulated in q9
    VSUBL.U8 q6,d15,d14         @ Row 7 of residue in q6

    VLD2.8 {d14,d16},[r1]       @ Row 8 of prediction in d14
    VLD2.8 {d15,d17},[r0]       @ Row 8 of source in d15
    VSWP.8 d14,d16
    VSWP.8 d15,d17

    B LUMA_LOAD_END

LUMA_LOAD:

    LDR r5,[sp,#72]             @ r5 = src_strd
    LDR r4,[sp,#76]             @ r4 = pred_strd

    VLD1.64 d0,[r1],r4          @ Row 1 of prediction in d0
    VLD1.64 d1,[r0],r5          @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0          @ Row 1 of absolute difference in q15
    VLD1.64 d2,[r1],r4          @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0           @ Row 1 of residue in q0
    VLD1.64 d3,[r0],r5          @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2           @ Row 2 of absolute difference in q9
    VLD1.64 d4,[r1],r4          @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2           @ Row 2 of residue in q1
    VLD1.64 d5,[r0],r5          @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4          @ Row 3 of absolute difference accumulated in q15
    VLD1.64 d6,[r1],r4          @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4           @ Row 3 of residue in q2
    VLD1.64 d7,[r0],r5          @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6           @ Row 4 of absolute difference accumulated in q9
    VLD1.64 d8,[r1],r4          @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6           @ Row 4 of residue in q3
    VLD1.64 d9,[r0],r5          @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8          @ Row 5 of absolute difference in q10
    VLD1.64 d10,[r1],r4         @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8           @ Row 5 of residue in q4
    VLD1.64 d11,[r0],r5         @ Row 6 of source in d11

    VABAL.U8 q15,d11,d10        @ Row 6 of absolute difference accumulated in q15
    VLD1.64 d12,[r1],r4         @ Row 7 of prediction in d12
    VSUBL.U8 q5,d11,d10         @ Row 6 of residue in q5
    VLD1.64 d13,[r0],r5         @ Row 7 of source in d13

    VABAL.U8 q9,d13,d12         @ Row 7 of absolute difference accumulated in q9
    VLD1.64 d14,[r1]            @ Row 8 of prediction in d14
    VSUBL.U8 q6,d13,d12         @ Row 7 of residue in q6
    VLD1.64 d15,[r0]            @ Row 8 of source in d15

LUMA_LOAD_END:

    @ Transform stage 1
    @ Transposing residue matrix

    VABAL.U8 q10,d15,d14        @ Row 8 of absolute difference accumulated in q10
    VTRN.16 q0,q1               @ Transpose residue matrix step (1a)
    VSUBL.U8 q7,d15,d14         @ Row 8 of residue in q7
    VTRN.16 q2,q3               @ Transpose residue matrix step (1b)

    VTRN.16 q4,q5               @ Transpose residue matrix step (1c)
    VTRN.16 q6,q7               @ Transpose residue matrix step (1d)
    VTRN.32 q0,q2               @ Transpose residue matrix step (2a)
    VTRN.32 q1,q3               @ Transpose residue matrix step (2b)

    VADD.U16 q8,q15,q9          @ SAD calculation (1)
    VTRN.32 q4,q6               @ Transpose residue matrix step (2c)
    VTRN.32 q5,q7               @ Transpose residue matrix step (2d)

    VADD.U16 q8,q8,q10          @ SAD calculation (2)
    VSWP d1,d8                  @ Transpose residue matrix step (3a)
    VSWP d3,d10                 @ Transpose residue matrix step (3b)

    VADD.U16 d16,d16,d17        @ SAD calculation (3)
    VSWP d7,d14                 @ Transpose residue matrix step (3c)
    VSWP d5,d12                 @ Transpose residue matrix step (3d)
                                @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
    VPADDL.U16 d16,d16          @ SAD calculation (4)

    @ Evaluating first step in Butterfly diagram

    VADD.S16 q10,q0,q7          @ q10 = C0 + C7
    VADD.S16 q11,q1,q6          @ q11 = C1 + C6
    VPADDL.U32 d16,d16          @ SAD calculation (5)
    VADD.S16 q12,q2,q5          @ q12 = C2 + C5
    VADD.S16 q13,q3,q4          @ q13 = C3 + C4

    VSUB.S16 q4,q3,q4           @ q4 = C3 - C4
    VSUB.S16 q5,q2,q5           @ q5 = C2 - C5
    VSUB.S16 q6,q1,q6           @ q6 = C1 - C6
    VSUB.S16 q7,q0,q7           @ q7 = C0 - C7

    @ Calculating F0, F2, F4 and F6

    VADD.S16 q1,q11,q12         @ q1 = C1 + C2 + C5 + C6
    VADD.S16 q2,q10,q13         @ q2 = C0 + C3 + C4 + C7

    MOV r4,#50
    LSL r4,r4,#16
    ADD r4,r4,#18
    MOV r5,#89
    LSL r5,r5,#16
    ADD r5,r5,#75
    VMOV d0,r4,r5               @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0] = 18

    MOV r4,#83
    LSL r4,r4,#16
    ADD r4,r4,#36
    VMOV d1,r4,r4               @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0] = 36

    VSUB.S16 q10,q10,q13        @ q10 = C0 - C3 - C4 + C7
    VSUB.S16 q11,q11,q12        @ q11 = C1 - C2 - C5 + C6
    VMOV.32 r0,d16[0]           @ SAD calculation (6) : Return value = SAD

    VSUB.S16 q3,q2,q1           @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
    VADD.S16 q2,q2,q1           @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7

    VMULL.S16 q14,d20,d1[1]     @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
    VMULL.S16 q15,d21,d1[1]     @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
    VMULL.S16 q9,d20,d1[0]      @ q9 = [0] of 36*(C0 - C3 - C4 + C7)
    VMULL.S16 q10,d21,d1[0]     @ q10 = [1] of 36*(C0 - C3 - C4 + C7)

    VMLAL.S16 q14,d22,d1[0]     @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    VSHLL.S16 q13,d6,#6         @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    VMLAL.S16 q15,d23,d1[0]     @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    VSHLL.S16 q3,d7,#6          @ q3 = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    VMLSL.S16 q9,d22,d1[1]      @ q9 = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    VSHLL.S16 q12,d4,#6         @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
    VMLSL.S16 q10,d23,d1[1]     @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    VSHLL.S16 q2,d5,#6          @ q2 = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)

    @ Calculating F1, F3, F5 and F7

    MOV r4,#48
    VST1.64 {d24,d25},[r2]!     @ Row 1 of transform stage 1 F0[0] stored
    VST1.64 {d4,d5},[r2],r4     @ Row 1 of transform stage 1 F0[1] stored
    VST1.64 {d28,d29},[r2]!     @ Row 3 of transform stage 1 F2[0] stored
    VST1.64 {d30,d31},[r2],r4   @ Row 3 of transform stage 1 F2[1] stored

    VST1.64 {d26,d27},[r2]!     @ Row 5 of transform stage 1 F4[0] stored
    VMULL.S16 q1,d14,d0[3]      @ q1 = [0] of 89*(C0 - C7)
    VMULL.S16 q8,d15,d0[3]      @ q8 = [1] of 89*(C0 - C7)
    VST1.64 {d6,d7},[r2],r4     @ Row 5 of transform stage 1 F4[1] stored
    VMULL.S16 q11,d14,d0[2]     @ q11 = [0] of 75*(C0 - C7)
    VMULL.S16 q13,d15,d0[2]     @ q13 = [1] of 75*(C0 - C7)
    VST1.64 {d18,d19},[r2]!     @ Row 7 of transform stage 1 F6[0] stored
    VMULL.S16 q3,d14,d0[1]      @ q3 = [0] of 50*(C0 - C7)
    VMULL.S16 q9,d15,d0[1]      @ q9 = [1] of 50*(C0 - C7)
    VST1.64 {d20,d21},[r2]      @ Row 7 of transform stage 1 F6[1] stored
    VMULL.S16 q10,d14,d0[0]     @ q10 = [0] of 18*(C0 - C7)
    VMULL.S16 q7,d15,d0[0]      @ q7 = [1] of 18*(C0 - C7)

    VMLAL.S16 q1,d12,d0[2]      @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6)
    VMLAL.S16 q8,d13,d0[2]      @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6)
    VMLSL.S16 q11,d12,d0[0]     @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
    VMLSL.S16 q13,d13,d0[0]     @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
    VMLSL.S16 q3,d12,d0[3]      @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6)
    VMLSL.S16 q9,d13,d0[3]      @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6)
    VMLSL.S16 q10,d12,d0[1]     @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
    VMLSL.S16 q7,d13,d0[1]      @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6)

    VMLAL.S16 q1,d10,d0[1]      @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    VMLAL.S16 q8,d11,d0[1]      @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    VMLSL.S16 q11,d10,d0[3]     @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    VMLSL.S16 q13,d11,d0[3]     @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    VMLAL.S16 q3,d10,d0[0]      @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    VMLAL.S16 q9,d11,d0[0]      @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    VMLAL.S16 q10,d10,d0[2]     @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
    VMLAL.S16 q7,d11,d0[2]      @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)

    VMLAL.S16 q1,d8,d0[0]       @ q1 = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    VMLAL.S16 q8,d9,d0[0]       @ q8 = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    VMLSL.S16 q11,d8,d0[1]      @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    VMLSL.S16 q13,d9,d0[1]      @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    SUB r2,r2,#176              @ r2 now points to the second row
    VMLAL.S16 q3,d8,d0[2]       @ q3 = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    VMLAL.S16 q9,d9,d0[2]       @ q9 = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    VST1.64 {d2,d3},[r2]!       @ Row 2 of transform stage 1 F1[0] stored
    VMLSL.S16 q10,d8,d0[3]      @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
    VMLSL.S16 q7,d9,d0[3]       @ q7 = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)

    VST1.64 {d16,d17},[r2],r4   @ Row 2 of transform stage 1 F1[1] stored
    VST1.64 {d22,d23},[r2]!     @ Row 4 of transform stage 1 F3[0] stored
    VST1.64 {d26,d27},[r2],r4   @ Row 4 of transform stage 1 F3[1] stored
    VST1.64 {d6,d7},[r2]!       @ Row 6 of transform stage 1 F5[0] stored
    VST1.64 {d18,d19},[r2],r4   @ Row 6 of transform stage 1 F5[1] stored
    VST1.64 {d20,d21},[r2]!     @ Row 8 of transform stage 1 F7[0] stored
    VST1.64 {d14,d15},[r2]      @ Row 8 of transform stage 1 F7[1] stored

    @ Transform stage 2 (for rows 1-4 of transform stage 1)
    @ Transposing the 4 rows (F0, F1, F2, F3)
    @ F0 = {q2,q12}, F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}

    VTRN.32 q12,q1              @ Transposing first half of transform stage 1 (1a)
    VTRN.32 q14,q11             @ Transposing first half of transform stage 1 (1b)
    VSWP d25,d28                @ Transposing first half of transform stage 1 (2a)
    VSWP d22,d3                 @ Transposing first half of transform stage 1 (2b)

    VTRN.32 q2,q8               @ Transposing first half of transform stage 1 (3a)
    VTRN.32 q15,q13             @ Transposing first half of transform stage 1 (3b)
    VSWP d5,d30                 @ Transposing first half of transform stage 1 (4a)
    VSWP d26,d17                @ Transposing first half of transform stage 1 (4b)
                                @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13

    @ Evaluating first step in Butterfly diagram

    VADD.S32 q0,q12,q13         @ q0 = B0 + B7
    VADD.S32 q5,q11,q2          @ q5 = B3 + B4
    VADD.S32 q3,q1,q15          @ q3 = B1 + B6
    VADD.S32 q4,q14,q8          @ q4 = B2 + B5

    VSUB.S32 q7,q14,q8          @ q7 = B2 - B5
    VSUB.S32 q8,q1,q15          @ q8 = B1 - B6
    VSUB.S32 q6,q11,q2          @ q6 = B3 - B4
    VSUB.S32 q9,q12,q13         @ q9 = B0 - B7

    @ Calculating G0, G2, G4 and G6

    MOV r4,#18
    MOV r5,#50
    VMOV d2,r4,r5               @ 32-bit aligned, d2[1] = 50, d2[0] = 18
    VSUB.S32 q2,q0,q5           @ q2 = B0 - B3 - B4 + B7

    MOV r4,#75
    MOV r5,#89
    VMOV d3,r4,r5               @ 32-bit aligned, d3[1] = 89, d3[0] = 75
    VADD.S32 q10,q0,q5          @ q10 = B0 + B3 + B4 + B7

    MOV r4,#36
    MOV r5,#83
    VMOV d0,r4,r5               @ 32-bit aligned, d0[1] = 83, d0[0] = 36
    VSUB.S32 q11,q3,q4          @ q11 = B1 - B2 - B5 + B6
    VADD.S32 q3,q3,q4           @ q3 = B1 + B2 + B5 + B6

    VMUL.S32 q12,q2,d0[1]       @ q12 = 83*(B0 - B3 - B4 + B7)
    VMUL.S32 q2,q2,d0[0]        @ q2 = 36*(B0 - B3 - B4 + B7)
    VMUL.S32 q5,q9,d3[1]        @ q5 = 89*(B0 - B7)
    VADD.S32 q14,q10,q3         @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
    VMUL.S32 q4,q9,d3[0]        @ q4 = 75*(B0 - B7)
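
    @ Note: the multiply by 64 for the DC-like terms (the VSHL #6 instructions
    @ commented out below) is folded into the narrowing shift, so G0/G4 use
    @ VRSHRN #5 while the remaining outputs use VRSHRN #11; both amount to the
    @ same overall scaling of the 8x8 forward transform (a combined shift of 11
    @ for 8-bit input).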
    VSUB.S32 q15,q10,q3         @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
@   VSHL.S32 q14,q14,#6         ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
@   VSHL.S32 q15,q15,#6         ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)

    VMLA.S32 q12,q11,d0[0]      @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
    VRSHRN.I32 d28,q14,#5       @ Truncating last 11 bits in G0
    VMLS.S32 q2,q11,d0[1]       @ q2 = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
    VRSHRN.I32 d30,q15,#5       @ Truncating last 11 bits in G4

    LDR r4,[sp,#80]             @ r4 = dst_strd
    LSL r4,r4,#2                @ r4 = 2*dst_strd*2

    VMUL.S32 q3,q9,d2[1]        @ q3 = 50*(B0 - B7)
    VRSHRN.I32 d24,q12,#11      @ Truncating last 11 bits in G2
    VMUL.S32 q9,q9,d2[0]        @ q9 = 18*(B0 - B7)
    VRSHRN.I32 d4,q2,#11        @ Truncating last 11 bits in G6

    VMLA.S32 q5,q8,d3[0]        @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
    VST1.64 d28,[r3],r4         @ First half-row of row 1 of transform stage 2 (G0) stored
    VMLS.S32 q4,q8,d2[0]        @ q4 = 75*(B0 - B7) - 18*(B1 - B6)

    VMLS.S32 q3,q8,d3[1]        @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
    VST1.64 d24,[r3],r4         @ First half-row of row 3 of transform stage 2 (G2) stored
    VMLS.S32 q9,q8,d2[1]        @ q9 = 18*(B0 - B7) - 50*(B1 - B6)

    VMLA.S32 q5,q7,d2[1]        @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
    VST1.64 d30,[r3],r4         @ First half-row of row 5 of transform stage 2 (G4) stored
    VMLS.S32 q4,q7,d3[1]        @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)

    VMLA.S32 q3,q7,d2[0]        @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
    VST1.64 d4,[r3]             @ First half-row of row 7 of transform stage 2 (G6) stored
    VMLA.S32 q9,q7,d3[0]        @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)

    VMLA.S32 q5,q6,d2[0]        @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    VMLS.S32 q4,q6,d2[1]        @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
    VMLA.S32 q3,q6,d3[0]        @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    VMLS.S32 q9,q6,d3[1]        @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)

    SUB r3,r3,r4,LSL #1
    SUB r3,r3,r4,ASR #1         @ r3 = r3 - 5*dst_strd*2
                                @ r3 is moved from row 7 to row 2
    VRSHRN.I32 d10,q5,#11       @ Truncating last 11 bits in G1
    VRSHRN.I32 d8,q4,#11        @ Truncating last 11 bits in G3
    VRSHRN.I32 d6,q3,#11        @ Truncating last 11 bits in G5
    VST1.64 d10,[r3],r4         @ First half-row of row 2 of transform stage 2 (G1) stored
    VRSHRN.I32 d18,q9,#11       @ Truncating last 11 bits in G7

    VST1.64 d8,[r3],r4          @ First half-row of row 4 of transform stage 2 (G3) stored
    VST1.64 d6,[r3],r4          @ First half-row of row 6 of transform stage 2 (G5) stored
    VST1.64 d18,[r3]!           @ First half-row of row 8 of transform stage 2 (G7) stored

    @ Transform stage 2 (for rows 5-8 of transform stage 1)
    @ Loading the 4 rows (F4, F5, F6, F7)

    SUB r2,r2,#112              @ r2 jumps from row 8 to row 5 in temporary memory
    VLD1.64 {d20,d21},[r2]!     @ q10 = F4[0]
    VLD1.64 {d22,d23},[r2]!     @ q11 = F4[1]
    VLD1.64 {d8,d9},[r2]!       @ q4 = F5[0]
    @ Transposing the 4 rows
    @ F0 = {q11,q10}, F1 = {q5,q4}, F2 = {q3,q2} and F3 = {q13,q12}

    VTRN.32 q10,q4              @ Transposing second half of transform stage 1 (1a)
    VLD1.64 {d10,d11},[r2]!     @ q5 = F5[1]
    VLD1.64 {d4,d5},[r2]!       @ q2 = F6[0]
    VLD1.64 {d6,d7},[r2]!       @ q3 = F6[1]
    VLD1.64 {d24,d25},[r2]!     @ q12 = F7[0]
    VTRN.32 q2,q12              @ Transposing second half of transform stage 1 (1b)
    VLD1.64 {d26,d27},[r2]      @ q13 = F7[1]

    VSWP d21,d4                 @ Transposing second half of transform stage 1 (2a)
    VSWP d24,d9                 @ Transposing second half of transform stage 1 (2b)

    VTRN.32 q11,q5              @ Transposing second half of transform stage 1 (3a)
    VTRN.32 q3,q13              @ Transposing second half of transform stage 1 (3b)
    VSWP d26,d11                @ Transposing second half of transform stage 1 (4b)
    VSWP d23,d6                 @ Transposing second half of transform stage 1 (4a)
                                @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13

    @ Evaluating first step in Butterfly diagram

    VADD.S32 q0,q10,q13         @ q0 = B0 + B7
    VADD.S32 q15,q12,q11        @ q15 = B3 + B4
    VADD.S32 q1,q4,q3           @ q1 = B1 + B6
    VADD.S32 q14,q2,q5          @ q14 = B2 + B5

    VSUB.S32 q9,q10,q13         @ q9 = B0 - B7
    VSUB.S32 q6,q12,q11         @ q6 = B3 - B4
    VSUB.S32 q7,q2,q5           @ q7 = B2 - B5
    VSUB.S32 q8,q4,q3           @ q8 = B1 - B6

    @ Calculating H0, H2, H4 and H6

    VADD.S32 q3,q1,q14          @ q3 = B1 + B2 + B5 + B6
    VSUB.S32 q5,q1,q14          @ q5 = B1 - B2 - B5 + B6

    MOV r4,#18
    MOV r5,#50
    VSUB.S32 q4,q0,q15          @ q4 = B0 - B3 - B4 + B7
    VMOV d2,r4,r5               @ 32-bit aligned, d2[1] = 50, d2[0] = 18

    MOV r4,#75
    MOV r5,#89
    VADD.S32 q2,q0,q15          @ q2 = B0 + B3 + B4 + B7
    VMOV d3,r4,r5               @ 32-bit aligned, d3[1] = 89, d3[0] = 75

    MOV r4,#36
    MOV r5,#83

    @ Calculating H1, H3, H5 and H7

    VMUL.S32 q10,q9,d3[1]       @ q10 = 89*(B0 - B7)
    VMOV d0,r4,r5               @ 32-bit aligned, d0[1] = 83, d0[0] = 36

    VMUL.S32 q13,q9,d3[0]       @ q13 = 75*(B0 - B7)

    VMUL.S32 q12,q4,d0[1]       @ q12 = 83*(B0 - B3 - B4 + B7)
    VADD.S32 q14,q2,q3          @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
    VMUL.S32 q4,q4,d0[0]        @ q4 = 36*(B0 - B3 - B4 + B7)
    VSUB.S32 q2,q2,q3           @ q2 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7

    VMLA.S32 q12,q5,d0[0]       @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
@   VSHL.S32 q14,q14,#6         ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
    VMLS.S32 q4,q5,d0[1]        @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
@   VSHL.S32 q2,q15,#6          ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)

    VMUL.S32 q11,q9,d2[1]       @ q11 = 50*(B0 - B7)
    VRSHRN.I32 d28,q14,#5       @ Truncating last 11 bits in H0
    VMUL.S32 q9,q9,d2[0]        @ q9 = 18*(B0 - B7)
    VRSHRN.I32 d24,q12,#11      @ Truncating last 11 bits in H2

    VMLA.S32 q10,q8,d3[0]       @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
    VRSHRN.I32 d4,q2,#5         @ Truncating last 11 bits in H4
    VMLS.S32 q13,q8,d2[0]       @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
    VRSHRN.I32 d8,q4,#11        @ Truncating last 11 bits in H6

    LDR r4,[sp,#80]             @ r4 = dst_strd
    LSL r4,r4,#2                @ r4 = 2*dst_strd*2

    SUB r3,r3,r4,LSL #2
    ADD r3,r3,r4,ASR #1         @ r3 = r3 - 7*dst_strd*2
                                @ r3 is moved from row 8 to row 1
    VMLS.S32 q11,q8,d3[1]       @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
    VST1.64 d28,[r3],r4         @ Second half-row of row 1 of transform stage 2 (H0) stored
    VMLS.S32 q9,q8,d2[1]        @ q9 = 18*(B0 - B7) - 50*(B1 - B6)

    VMLA.S32 q10,q7,d2[1]       @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
    VST1.64 d24,[r3],r4         @ Second half-row of row 3 of transform stage 2 (H2) stored
    VMLS.S32 q13,q7,d3[1]       @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)

    VMLA.S32 q11,q7,d2[0]       @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
    VST1.64 d4,[r3],r4          @ Second half-row of row 5 of transform stage 2 (H4) stored
    VMLA.S32 q9,q7,d3[0]        @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)

    VMLA.S32 q10,q6,d2[0]       @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    VST1.64 d8,[r3]             @ Second half-row of row 7 of transform stage 2 (H6) stored
    VMLS.S32 q13,q6,d2[1]       @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)

    VMLA.S32 q11,q6,d3[0]       @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    VMLS.S32 q9,q6,d3[1]        @ q9 = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)

    SUB r3,r3,r4,LSL #1
    SUB r3,r3,r4,ASR #1         @ r3 = r3 - 5*dst_strd
                                @ r3 is moved from row 7 to row 2
    VRSHRN.I32 d20,q10,#11      @ Truncating last 11 bits in H1
    VRSHRN.I32 d26,q13,#11      @ Truncating last 11 bits in H3
    VRSHRN.I32 d22,q11,#11      @ Truncating last 11 bits in H5
    VST1.64 d20,[r3],r4         @ Second half-row of row 2 of transform stage 2 (H1) stored
    VRSHRN.I32 d18,q9,#11       @ Truncating last 11 bits in H7

    VST1.64 d26,[r3],r4         @ Second half-row of row 4 of transform stage 2 (H3) stored
    VST1.64 d22,[r3],r4         @ Second half-row of row 6 of transform stage 2 (H5) stored
    VST1.64 d18,[r3]            @ Second half-row of row 8 of transform stage 2 (H7) stored

    vpop {d8 - d15}
    POP {r4,r5}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform on
@*  input pixels
@*
@* @par Description:
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by forward transform
@*
@* @param[in] pu1_src
@*  Input 16x16 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 16x16
@*
@* @param[out] pi2_dst
@*  Output 16x16 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd
@*  Output Stride
@*
@* @param[in] chr_plane
@*  Chroma plane
@*
@* @returns Void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/

.extern g_ai2_ihevc_trans_16
.extern g_ai4_ihevc_trans_16

g_ai2_ihevc_trans_16_addr_1:
.long g_ai2_ihevc_trans_16 - ulbl1 - 8

g_ai2_ihevc_trans_16_addr_2:
.long g_ai2_ihevc_trans_16 - ulbl2 - 8

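@ The literals above and below hold PC-relative offsets to the coefficient
@ tables: each one is added to PC at the matching ulblN label further down, and
@ the "- 8" term accounts for the ARM-state PC read-ahead (PC reads as the
@ current instruction address plus 8 when used as an operand).
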
993*c83a76b0SSuyog Pawarg_ai4_ihevc_trans_16_addr: 994*c83a76b0SSuyog Pawar.long g_ai4_ihevc_trans_16 - ulbl3 - 8 995*c83a76b0SSuyog Pawar 996*c83a76b0SSuyog Pawar .global ihevc_resi_trans_16x16_a9q 997*c83a76b0SSuyog Pawar 998*c83a76b0SSuyog Pawarihevc_resi_trans_16x16_a9q: 999*c83a76b0SSuyog Pawar 1000*c83a76b0SSuyog Pawar.equ TMP_STRIDE , 64 @16*4, Stride of tmp register 1001*c83a76b0SSuyog Pawar.equ SHIFT , 13 @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement 1002*c83a76b0SSuyog Pawar.equ RADD , 4096 @1 << (shift - 1); 1003*c83a76b0SSuyog Pawar 1004*c83a76b0SSuyog Pawar.equ COFF_STD_2B , 32 @Stride for g_ai2_ihevc_trans_16 in bytes 1005*c83a76b0SSuyog Pawar.equ COFF_STD_W , 32 @Stride for g_ai4_ihevc_trans_16 in bytes 1006*c83a76b0SSuyog Pawar 1007*c83a76b0SSuyog Pawar@;LOAD the fucntion 1008*c83a76b0SSuyog Pawar STMFD SP!,{r4-r12,LR} @stack store values of the arguments 1009*c83a76b0SSuyog Pawar vpush {d8 - d15} 1010*c83a76b0SSuyog Pawar SUB SP,SP,#32 1011*c83a76b0SSuyog Pawar 1012*c83a76b0SSuyog Pawar LDR R4,[SP,#136] @get src_strd 1013*c83a76b0SSuyog Pawar LDR R5,[SP,#140] @get pred_strd 1014*c83a76b0SSuyog Pawar LDR R6,[SP,#144] @get dst_strd 1015*c83a76b0SSuyog Pawar LDR R14,[SP,#148] @get chroma_plane 1016*c83a76b0SSuyog Pawar 1017*c83a76b0SSuyog Pawar MOV R8,#0 @Set loop counter 1018*c83a76b0SSuyog Pawar LDR R9,g_ai2_ihevc_trans_16_addr_1 @get 16 bit transform matrix 1019*c83a76b0SSuyog Pawarulbl1: 1020*c83a76b0SSuyog Pawar ADD R9, R9, PC 1021*c83a76b0SSuyog Pawar @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16 1022*c83a76b0SSuyog Pawar @and write to stack 1023*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_2B 1024*c83a76b0SSuyog Pawar LSL R12,#2 1025*c83a76b0SSuyog Pawar 1026*c83a76b0SSuyog Pawar VLD1.S32 D30[0],[R9],R12 1027*c83a76b0SSuyog Pawar VLD1.S32 D30[1],[R9],R12 1028*c83a76b0SSuyog Pawar VLD1.S32 D31[0],[R9],R12 1029*c83a76b0SSuyog Pawar VLD1.S32 D31[1],[R9],R12 1030*c83a76b0SSuyog Pawar 1031*c83a76b0SSuyog Pawar VTRN.S32 D30,D31 1032*c83a76b0SSuyog Pawar VTRN.S16 D30,D31 1033*c83a76b0SSuyog Pawar VST1.S16 {d30,d31},[SP] 1034*c83a76b0SSuyog Pawar 1035*c83a76b0SSuyog Pawar LDR R9,g_ai2_ihevc_trans_16_addr_2 @get back 16 bit transform matrix 1036*c83a76b0SSuyog Pawarulbl2: 1037*c83a76b0SSuyog Pawar ADD R9, R9, PC 1038*c83a76b0SSuyog Pawar 1039*c83a76b0SSuyog Pawar MOV R7,#TMP_STRIDE 1040*c83a76b0SSuyog Pawar 1041*c83a76b0SSuyog Pawar VMOV.S32 Q14,#0 1042*c83a76b0SSuyog Pawar 1043*c83a76b0SSuyog Pawar@R0 pu1_src 1044*c83a76b0SSuyog Pawar@R1 pu1_pred 1045*c83a76b0SSuyog Pawar@R2 pi4_tmp 1046*c83a76b0SSuyog Pawar@R3 pi2_dst 1047*c83a76b0SSuyog Pawar@R4 src_strd 1048*c83a76b0SSuyog Pawar@R5 pred_strd 1049*c83a76b0SSuyog Pawar@R6 dst_strd 1050*c83a76b0SSuyog Pawar@R7 tmp_dst Nx4 block stride 1051*c83a76b0SSuyog Pawar@R8 loop cntr 1052*c83a76b0SSuyog Pawar@R9 g_ai2_ihevc_trans_16 1053*c83a76b0SSuyog Pawar@R10 tmp_dst Nx4 block offset 1054*c83a76b0SSuyog Pawar@R11 tmp register 1055*c83a76b0SSuyog Pawar@R12 ------ 1056*c83a76b0SSuyog Pawar@R14 chroma_plane 1057*c83a76b0SSuyog Pawar@q14 shift 32 bit 1058*c83a76b0SSuyog Pawar@q15 add 32 bit 1059*c83a76b0SSuyog Pawar 1060*c83a76b0SSuyog PawarCORE_LOOP_16X16_HORIZ: 1061*c83a76b0SSuyog Pawar 1062*c83a76b0SSuyog Pawar CMP R14,#-1 1063*c83a76b0SSuyog Pawar BGT INTERLEAVED_LOAD_S1 1064*c83a76b0SSuyog Pawar 1065*c83a76b0SSuyog Pawar VLD1.U8 {d0,d1},[R0],R4 @LOAD 1-16 src row 1 1066*c83a76b0SSuyog Pawar VLD1.U8 {d2,d3},[R1],R5 @LOAD 1-16 pred row 1 1067*c83a76b0SSuyog Pawar VLD1.U8 {d4,d5},[R0],R4 @LOAD 
1-16 src row 2 1068*c83a76b0SSuyog Pawar VLD1.U8 {d6,d7},[R1],R5 @LOAD 1-16 pred row 2 1069*c83a76b0SSuyog Pawar B LOAD_DONE 1070*c83a76b0SSuyog Pawar 1071*c83a76b0SSuyog PawarINTERLEAVED_LOAD_S1: 1072*c83a76b0SSuyog Pawar CMP R14,#1 1073*c83a76b0SSuyog Pawar BEQ INTERLEAVED_LOAD_S2 1074*c83a76b0SSuyog Pawar VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1 1075*c83a76b0SSuyog Pawar VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1 1076*c83a76b0SSuyog Pawar VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2 1077*c83a76b0SSuyog Pawar VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2 1078*c83a76b0SSuyog Pawar B LOAD_DONE 1079*c83a76b0SSuyog Pawar 1080*c83a76b0SSuyog PawarINTERLEAVED_LOAD_S2: 1081*c83a76b0SSuyog Pawar VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1 1082*c83a76b0SSuyog Pawar VSWP.U8 Q0,Q1 1083*c83a76b0SSuyog Pawar VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1 1084*c83a76b0SSuyog Pawar VSWP.U8 Q1,Q2 1085*c83a76b0SSuyog Pawar VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2 1086*c83a76b0SSuyog Pawar VSWP.U8 Q2,Q3 1087*c83a76b0SSuyog Pawar VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2 1088*c83a76b0SSuyog Pawar VSWP.U8 Q3,Q4 1089*c83a76b0SSuyog Pawar 1090*c83a76b0SSuyog PawarLOAD_DONE: 1091*c83a76b0SSuyog Pawar 1092*c83a76b0SSuyog Pawar VSUBL.U8 Q4,D0,D2 @Get residue 1-8 row 1 1093*c83a76b0SSuyog Pawar VSUBL.U8 Q5,D1,D3 @Get residue 9-16 row 1 1094*c83a76b0SSuyog Pawar VSUBL.U8 Q6,D4,D6 @Get residue 1-8 row 2 1095*c83a76b0SSuyog Pawar VSUBL.U8 Q7,D5,D7 @Get residue 9-16 row 2 1096*c83a76b0SSuyog Pawar 1097*c83a76b0SSuyog Pawar @Get blk sads 1098*c83a76b0SSuyog Pawar VABDL.U8 Q15,D0,D2 1099*c83a76b0SSuyog Pawar VABAL.U8 Q15,D1,D3 1100*c83a76b0SSuyog Pawar VABAL.U8 Q15,D4,D6 1101*c83a76b0SSuyog Pawar VABAL.U8 Q15,D5,D7 1102*c83a76b0SSuyog Pawar VADDW.S16 Q14,Q14,D30 1103*c83a76b0SSuyog Pawar VADDW.S16 Q14,Q14,D31 1104*c83a76b0SSuyog Pawar 1105*c83a76b0SSuyog Pawar VREV64.S16 Q5,Q5 @Rev row 1 1106*c83a76b0SSuyog Pawar VREV64.S16 Q7,Q7 @Rev row 2 1107*c83a76b0SSuyog Pawar VSWP D10,D11 1108*c83a76b0SSuyog Pawar VSWP D14,D15 1109*c83a76b0SSuyog Pawar 1110*c83a76b0SSuyog Pawar VADD.S16 Q8 ,Q4,Q5 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 1 1111*c83a76b0SSuyog Pawar VSUB.S16 Q9 ,Q4,Q5 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 1 1112*c83a76b0SSuyog Pawar VADD.S16 Q10,Q6,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 2 1113*c83a76b0SSuyog Pawar VSUB.S16 Q11,Q6,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 2 1114*c83a76b0SSuyog Pawar 1115*c83a76b0SSuyog Pawar VREV64.S16 D24,D17 @rev e[k] k-> 4-7 row 1 1116*c83a76b0SSuyog Pawar VREV64.S16 D25,D21 @rev e[k] k-> 4-7 row 2 1117*c83a76b0SSuyog Pawar VMOV.S16 D17,D20 1118*c83a76b0SSuyog Pawar 1119*c83a76b0SSuyog Pawar @arrangement OF DATA 1120*c83a76b0SSuyog Pawar @Q8 A1 A2 A3 A4 B1 B2 B3 B4 1121*c83a76b0SSuyog Pawar @Q12 A8 A7 A6 A5 B8 B7 B6 B5 1122*c83a76b0SSuyog Pawar 1123*c83a76b0SSuyog Pawar VADD.S16 Q13,Q8,Q12 @ee[k] = e[k] + e[7 - k] row 1 & 2 1124*c83a76b0SSuyog Pawar VSUB.S16 Q0,Q8,Q12 @eo[k] = e[k] - e[7 - k] row 1 & 2 1125*c83a76b0SSuyog Pawar 1126*c83a76b0SSuyog Pawar @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3] 1127*c83a76b0SSuyog Pawar @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3] 1128*c83a76b0SSuyog Pawar VTRN.S32 D26,D27 @1-cycle stall before it? 1129*c83a76b0SSuyog Pawar @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1130*c83a76b0SSuyog Pawar @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3] 1131*c83a76b0SSuyog Pawar VREV32.16 D2,D27 @1-cycle stall before it? 
1132*c83a76b0SSuyog Pawar @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1133*c83a76b0SSuyog Pawar @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2] 1134*c83a76b0SSuyog Pawar VMOV.S16 D27,D26 1135*c83a76b0SSuyog Pawar VNEG.S16 D3,D2 1136*c83a76b0SSuyog Pawar @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1] R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1137*c83a76b0SSuyog Pawar @Q1 R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2] 1138*c83a76b0SSuyog Pawar 1139*c83a76b0SSuyog Pawar @D8 : [0 0] [4 0] [8 0] [12 0] 1140*c83a76b0SSuyog Pawar @D9 : [0 1] [4 1] [8 1] [12 1] 1141*c83a76b0SSuyog Pawar VLD1.S16 {d8,d9},[SP] @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1] 1142*c83a76b0SSuyog Pawar VADD.S16 Q1,Q13,Q1 @ 1-cycle stall before it? 1143*c83a76b0SSuyog Pawar @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1144*c83a76b0SSuyog Pawar 1145*c83a76b0SSuyog Pawar @Q1 R1eee[0] R1eee[1] R2eee[0] R2eee[1] 1146*c83a76b0SSuyog Pawar @ R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1147*c83a76b0SSuyog Pawar VTRN.S16 D2,D3 @2-cycle stall before it? 1148*c83a76b0SSuyog Pawar @Q1 R1eee[0] R1eeo[0] R2eee[0] R2eeo[0] 1149*c83a76b0SSuyog Pawar @ R1eee[1] R1eeo[1] R2eee[1] R2eeo[1] 1150*c83a76b0SSuyog Pawar 1151*c83a76b0SSuyog Pawar VDUP.S32 D4,D2[0] @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] ;1-cycle stall? 1152*c83a76b0SSuyog Pawar VDUP.S32 D5,D2[1] @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1153*c83a76b0SSuyog Pawar VDUP.S32 D6,D3[0] @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1154*c83a76b0SSuyog Pawar VDUP.S32 D7,D3[1] @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1155*c83a76b0SSuyog Pawar 1156*c83a76b0SSuyog Pawar @---------------Process EO-------------------- 1157*c83a76b0SSuyog Pawar @ Early start to avoid stalls 1158*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_2B @Get stride of coeffs 1159*c83a76b0SSuyog Pawar 1160*c83a76b0SSuyog Pawar VMULL.S16 Q5,D4,D8 @ g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] 1161*c83a76b0SSuyog Pawar VMLAL.S16 Q5,D6,D9 @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1162*c83a76b0SSuyog Pawar VMULL.S16 Q6,D5,D8 @ g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1163*c83a76b0SSuyog Pawar VMLAL.S16 Q6,D7,D9 @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1164*c83a76b0SSuyog Pawar 1165*c83a76b0SSuyog Pawar ADD R11,R9,R12,LSL #1 @Load address of g_ai2_ihevc_trans_16[2] 1166*c83a76b0SSuyog Pawar LSL R12,R12,#2 1167*c83a76b0SSuyog Pawar 1168*c83a76b0SSuyog Pawar VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4]] 1169*c83a76b0SSuyog Pawar 1170*c83a76b0SSuyog Pawar VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1171*c83a76b0SSuyog Pawar VMULL.S16 Q1,D26,D0 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R1 1172*c83a76b0SSuyog Pawar 1173*c83a76b0SSuyog Pawar VMULL.S16 Q2,D26,D1 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R2 1174*c83a76b0SSuyog Pawar 1175*c83a76b0SSuyog Pawar VZIP.S32 Q5,Q6 @3-cycle instruction 1176*c83a76b0SSuyog Pawar VMULL.S16 Q3,D27,D0 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R1 1177*c83a76b0SSuyog Pawar 1178*c83a76b0SSuyog Pawar 1179*c83a76b0SSuyog Pawar VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1180*c83a76b0SSuyog Pawar VMULL.S16 Q4,D27,D1 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R2 1181*c83a76b0SSuyog Pawar 1182*c83a76b0SSuyog Pawar @These values must go to 0 4 8 12 colums hence we need stride *4 1183*c83a76b0SSuyog Pawar LSL R10,R7,#2 1184*c83a76b0SSuyog Pawar 1185*c83a76b0SSuyog Pawar VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1186*c83a76b0SSuyog Pawar 1187*c83a76b0SSuyog Pawar VST1.32 D10,[R2],R10 
1188*c83a76b0SSuyog Pawar VMULL.S16 Q8,D27,D1 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2 1189*c83a76b0SSuyog Pawar 1190*c83a76b0SSuyog Pawar VST1.32 D11,[R2],R10 1191*c83a76b0SSuyog Pawar VMULL.S16 Q7,D27,D0 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1 1192*c83a76b0SSuyog Pawar 1193*c83a76b0SSuyog Pawar VST1.32 D12,[R2],R10 1194*c83a76b0SSuyog Pawar VMULL.S16 Q5,D26,D0 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1 1195*c83a76b0SSuyog Pawar 1196*c83a76b0SSuyog Pawar VST1.32 D13,[R2],R10 1197*c83a76b0SSuyog Pawar VMULL.S16 Q6,D26,D1 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2 1198*c83a76b0SSuyog Pawar 1199*c83a76b0SSuyog Pawar SUB R2,R2,R10,LSL #2 1200*c83a76b0SSuyog Pawar 1201*c83a76b0SSuyog Pawar @transpose the 4x4 matrix row1 1202*c83a76b0SSuyog Pawar VTRN.32 Q1, Q3 @R1 transpose1 -- 2 cycles 1203*c83a76b0SSuyog Pawar 1204*c83a76b0SSuyog Pawar @transpose the 4x4 matrix row2 1205*c83a76b0SSuyog Pawar VTRN.32 Q2,Q4 @R2 transpose1 -- 2 cycles 1206*c83a76b0SSuyog Pawar 1207*c83a76b0SSuyog Pawar VTRN.32 Q5, Q7 @R1 transpose1 -- 2 cycles 1208*c83a76b0SSuyog Pawar 1209*c83a76b0SSuyog Pawar VTRN.32 Q6,Q8 @R2 transpose1 -- 2 cycles 1210*c83a76b0SSuyog Pawar 1211*c83a76b0SSuyog Pawar VSWP D10,D3 @R1 transpose2 1212*c83a76b0SSuyog Pawar VSWP D14,D7 @R1 transpose2 1213*c83a76b0SSuyog Pawar 1214*c83a76b0SSuyog Pawar VSWP D12,D5 @R2 transpose2 1215*c83a76b0SSuyog Pawar VSWP D16,D9 @R2 transpose2 1216*c83a76b0SSuyog Pawar 1217*c83a76b0SSuyog Pawar VADD.S32 Q5,Q5,Q1 @R1 add 1218*c83a76b0SSuyog Pawar VADD.S32 Q3,Q3,Q7 @R1 add 1219*c83a76b0SSuyog Pawar 1220*c83a76b0SSuyog Pawar VADD.S32 Q2,Q2,Q4 @R2 add 1221*c83a76b0SSuyog Pawar VADD.S32 Q6,Q6,Q8 @R2 add 1222*c83a76b0SSuyog Pawar 1223*c83a76b0SSuyog Pawar VADD.S32 Q5,Q5,Q3 @R1 add 1224*c83a76b0SSuyog Pawar 1225*c83a76b0SSuyog Pawar VADD.S32 Q4,Q6,Q2 @R2 add 1226*c83a76b0SSuyog Pawar 1227*c83a76b0SSuyog Pawar @-----------------------Processing O ---------------------------- 1228*c83a76b0SSuyog Pawar @ Early start to avoid stalls 1229*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_2B @Get coeffs stride 1230*c83a76b0SSuyog Pawar LSL R12,R12,#1 1231*c83a76b0SSuyog Pawar ADD R11,R9,#COFF_STD_2B @Get address of g_ai2_ihevc_trans_16[1] 1232*c83a76b0SSuyog Pawar 1233*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles 1234*c83a76b0SSuyog Pawar 1235*c83a76b0SSuyog Pawar VZIP.S32 Q5,Q4 @ 3 cycle instruction 1236*c83a76b0SSuyog Pawar VMULL.S16 Q6,D18,D4 @o[0][0-3]* R1 1237*c83a76b0SSuyog Pawar 1238*c83a76b0SSuyog Pawar 1239*c83a76b0SSuyog Pawar VMLAL.S16 Q6,D19,D5 @o[0][4-7]* R1 ; follows MULL instruction: Multiplier accumulator forwarding 1240*c83a76b0SSuyog Pawar @write to memory 1241*c83a76b0SSuyog Pawar @this should go to 2 6 10 14 1242*c83a76b0SSuyog Pawar LSL R10,R7,#2 1243*c83a76b0SSuyog Pawar ADD R2,R2,R7,LSL #1 @move to third row 1244*c83a76b0SSuyog Pawar VST1.32 D10,[R2],R10 1245*c83a76b0SSuyog Pawar VMULL.S16 Q7,D22,D4 @o[0][0-3]* R2 1246*c83a76b0SSuyog Pawar 1247*c83a76b0SSuyog Pawar VST1.32 D11,[R2],R10 1248*c83a76b0SSuyog Pawar VMLAL.S16 Q7,D23,D5 @o[0][4-7]* R2 1249*c83a76b0SSuyog Pawar 1250*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1251*c83a76b0SSuyog Pawar 1252*c83a76b0SSuyog Pawar VST1.32 D8,[R2],R10 1253*c83a76b0SSuyog Pawar VMULL.S16 Q8,D18,D4 @o[1][0-3]* R1 1254*c83a76b0SSuyog Pawar 1255*c83a76b0SSuyog Pawar VST1.32 D9,[R2],R10 1256*c83a76b0SSuyog Pawar VMLAL.S16 Q8,D19,D5 @o[1][4-7]* R1 1257*c83a76b0SSuyog Pawar SUB R2,R2,R10,LSL #2 1258*c83a76b0SSuyog Pawar SUB R2,R2,R7,LSL #1 
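@ Recap of the horizontal-stage maths up to this point (a sketch using the
@ standard HEVC partial-butterfly names; these identifiers are illustrative
@ and do not appear in this file). For one row of 16 residues r[0..15]:
@     e[k]   = r[k] + r[15 - k]                              k = 0..7
@     o[k]   = r[k] - r[15 - k]                              k = 0..7
@     ee[k]  = e[k] + e[7 - k],   eo[k]  = e[k] - e[7 - k]   k = 0..3
@     eee[k] = ee[k] + ee[3 - k], eeo[k] = ee[k] - ee[3 - k] k = 0..1
@ The eee/eeo terms give the outputs stored to columns 0, 4, 8 and 12 of
@ pi4_tmp, the eo[] dot products with the first four entries of
@ g_ai2_ihevc_trans_16[2/6/10/14] give columns 2, 6, 10 and 14, and the o[]
@ dot products with the odd coefficient rows give the odd-numbered columns
@ (that O processing is interleaved above and continues below).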
1259*c83a76b0SSuyog Pawar 1260*c83a76b0SSuyog Pawar @--------------------Done procrssing EO ------------------------- 1261*c83a76b0SSuyog Pawar 1262*c83a76b0SSuyog Pawar @ -----------------Processing O continues------------------------ 1263*c83a76b0SSuyog Pawar 1264*c83a76b0SSuyog Pawar VMULL.S16 Q10,D22,D4 @o[1][0-3]* R2 1265*c83a76b0SSuyog Pawar VMLAL.S16 Q10,D23,D5 @o[1][4-7]* R2 1266*c83a76b0SSuyog Pawar 1267*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1268*c83a76b0SSuyog Pawar 1269*c83a76b0SSuyog Pawar VLD1.S16 {d6,d7},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1270*c83a76b0SSuyog Pawar VMULL.S16 Q12,D18,D4 @o[2][0-3]* R1 1271*c83a76b0SSuyog Pawar 1272*c83a76b0SSuyog Pawar VMLAL.S16 Q12,D19,D5 @o[2][4-7]* R1 1273*c83a76b0SSuyog Pawar VMULL.S16 Q0,D18,D6 @o[3][0-3]* R1 1274*c83a76b0SSuyog Pawar VMLAL.S16 Q0,D19,D7 @o[3][4-7]* R1 1275*c83a76b0SSuyog Pawar 1276*c83a76b0SSuyog Pawar VMULL.S16 Q13,D22,D4 @o[2][0-3]* R2 1277*c83a76b0SSuyog Pawar VMLAL.S16 Q13,D23,D5 @o[2][4-7]* R2 1278*c83a76b0SSuyog Pawar VMULL.S16 Q1,D22,D6 @o[3][0-3]* R2 1279*c83a76b0SSuyog Pawar VMLAL.S16 Q1,D23,D7 @o[3][4-7]* R2 1280*c83a76b0SSuyog Pawar 1281*c83a76b0SSuyog Pawar @transpose the 4x4 matrix R1 1282*c83a76b0SSuyog Pawar VTRN.32 Q6, Q8 @ 2-cycle instruction 1283*c83a76b0SSuyog Pawar 1284*c83a76b0SSuyog Pawar VTRN.32 Q12,Q0 @ 2-cycle instruction 1285*c83a76b0SSuyog Pawar 1286*c83a76b0SSuyog Pawar @transpose the 4x4 matrix R2 1287*c83a76b0SSuyog Pawar VTRN.32 Q7,Q10 @ 2-cycle instruction 1288*c83a76b0SSuyog Pawar 1289*c83a76b0SSuyog Pawar VTRN.32 Q13,Q1 @ 2-cycle instruction 1290*c83a76b0SSuyog Pawar 1291*c83a76b0SSuyog Pawar VSWP D24,D13 1292*c83a76b0SSuyog Pawar VSWP D0, D17 1293*c83a76b0SSuyog Pawar 1294*c83a76b0SSuyog Pawar VSWP D26,D15 1295*c83a76b0SSuyog Pawar VSWP D2,D21 1296*c83a76b0SSuyog Pawar 1297*c83a76b0SSuyog Pawar VADD.S32 Q8 ,Q8 ,Q6 1298*c83a76b0SSuyog Pawar VADD.S32 Q12,Q12,Q0 1299*c83a76b0SSuyog Pawar 1300*c83a76b0SSuyog Pawar VADD.S32 Q10,Q10,Q7 1301*c83a76b0SSuyog Pawar VADD.S32 Q13,Q13,Q1 1302*c83a76b0SSuyog Pawar 1303*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[9][0-7] 1304*c83a76b0SSuyog Pawar VADD.S32 Q12 ,Q12 ,Q8 1305*c83a76b0SSuyog Pawar 1306*c83a76b0SSuyog Pawar VADD.S32 Q13,Q13,Q10 1307*c83a76b0SSuyog Pawar VMULL.S16 Q3,D18,D4 @o[4][0-3]* R1 1308*c83a76b0SSuyog Pawar VMLAL.S16 Q3,D19,D5 @o[4][4-7]* R1 1309*c83a76b0SSuyog Pawar 1310*c83a76b0SSuyog Pawar VZIP.S32 Q12,Q13 1311*c83a76b0SSuyog Pawar VMULL.S16 Q4,D22,D4 @o[0][0-3]* R2 1312*c83a76b0SSuyog Pawar 1313*c83a76b0SSuyog Pawar 1314*c83a76b0SSuyog Pawar VMLAL.S16 Q4,D23,D5 @o[0][4-7]* R2 1315*c83a76b0SSuyog Pawar @write to memory 1316*c83a76b0SSuyog Pawar @this should go to 1 3 5 7 1317*c83a76b0SSuyog Pawar ADD R2,R2,R7 1318*c83a76b0SSuyog Pawar LSL R7,R7,#1 1319*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[11][0-7] 1320*c83a76b0SSuyog Pawar 1321*c83a76b0SSuyog Pawar VST1.32 D24,[R2],R7 1322*c83a76b0SSuyog Pawar VMULL.S16 Q5,D18,D4 @o[5][0-3]* R1 1323*c83a76b0SSuyog Pawar 1324*c83a76b0SSuyog Pawar VST1.32 D25,[R2],R7 1325*c83a76b0SSuyog Pawar VMLAL.S16 Q5,D19,D5 @o[5][4-7]* R1 1326*c83a76b0SSuyog Pawar 1327*c83a76b0SSuyog Pawar VST1.32 D26,[R2],R7 1328*c83a76b0SSuyog Pawar VMULL.S16 Q6,D22,D4 @o[0][0-3]* R2 1329*c83a76b0SSuyog Pawar 1330*c83a76b0SSuyog Pawar VST1.32 D27,[R2],R7 1331*c83a76b0SSuyog Pawar VMLAL.S16 Q6,D23,D5 @o[0][4-7]* R2 1332*c83a76b0SSuyog Pawar 1333*c83a76b0SSuyog Pawar VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[13][0-7] 
1334*c83a76b0SSuyog Pawar
1335*c83a76b0SSuyog Pawar    VLD1.S16 {d2,d3},[R11],R12      @g_ai2_ihevc_trans_16[15][0-7]
1336*c83a76b0SSuyog Pawar    VMULL.S16 Q7,D18,D4             @o[6][0-3]* R1
1337*c83a76b0SSuyog Pawar
1338*c83a76b0SSuyog Pawar    VMLAL.S16 Q7,D19,D5             @o[6][4-7]* R1
1339*c83a76b0SSuyog Pawar    VMULL.S16 Q10,D18,D2            @o[7][0-3]* R1
1340*c83a76b0SSuyog Pawar    VMLAL.S16 Q10,D19,D3            @o[7][4-7]* R1
1341*c83a76b0SSuyog Pawar
1342*c83a76b0SSuyog Pawar    VMULL.S16 Q8,D22,D4             @o[6][0-3]* R2
1343*c83a76b0SSuyog Pawar    VMLAL.S16 Q8,D23,D5             @o[6][4-7]* R2
1344*c83a76b0SSuyog Pawar    VMULL.S16 Q12,D22,D2            @o[7][0-3]* R2
1345*c83a76b0SSuyog Pawar    VMLAL.S16 Q12,D23,D3            @o[7][4-7]* R2
1346*c83a76b0SSuyog Pawar
1347*c83a76b0SSuyog Pawar
1348*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R1
1349*c83a76b0SSuyog Pawar    VTRN.32 Q3 ,Q5                  @ 2-cycle instruction
1350*c83a76b0SSuyog Pawar
1351*c83a76b0SSuyog Pawar    VTRN.32 Q7 ,Q10                 @ transpose step 2 R1 , 2-cycle instruction
1352*c83a76b0SSuyog Pawar
1353*c83a76b0SSuyog Pawar    @transpose the 4x4 matrix R2
1354*c83a76b0SSuyog Pawar    VTRN.32 Q4 ,Q6                  @ 2-cycle instruction
1355*c83a76b0SSuyog Pawar
1356*c83a76b0SSuyog Pawar    VTRN.32 Q8 ,Q12                 @ transpose step 2 R2 , 2-cycle instruction
1357*c83a76b0SSuyog Pawar
1358*c83a76b0SSuyog Pawar    VSWP D14,D7                     @ transpose step 3, R1
1359*c83a76b0SSuyog Pawar    VSWP D20,D11                    @ transpose step 4, R1
1360*c83a76b0SSuyog Pawar    VSWP D16,D9                     @ transpose step 3, R2
1361*c83a76b0SSuyog Pawar    VSWP D24,D13                    @ transpose step 4, R2
1362*c83a76b0SSuyog Pawar
1363*c83a76b0SSuyog Pawar    VADD.S32 Q5 ,Q5 ,Q3
1364*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q10,Q7
1365*c83a76b0SSuyog Pawar    VADD.S32 Q6 ,Q6 ,Q4
1366*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q8
1367*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q10,Q5
1368*c83a76b0SSuyog Pawar    VADD.S32 Q12,Q12,Q6
1369*c83a76b0SSuyog Pawar
1370*c83a76b0SSuyog Pawar    @ 2-cycle stall
1371*c83a76b0SSuyog Pawar    VZIP.S32 Q10,Q12                @ 3-cycle instruction
1372*c83a76b0SSuyog Pawar
1373*c83a76b0SSuyog Pawar    @ 2-cycle stall
1374*c83a76b0SSuyog Pawar    @this should go to 9 11 13 15
1375*c83a76b0SSuyog Pawar    VST1.32 D20,[R2],R7
1376*c83a76b0SSuyog Pawar
1377*c83a76b0SSuyog Pawar    VST1.32 D21,[R2],R7
1378*c83a76b0SSuyog Pawar
1379*c83a76b0SSuyog Pawar    VST1.32 D24,[R2],R7
1380*c83a76b0SSuyog Pawar
1381*c83a76b0SSuyog Pawar    VST1.32 D25,[R2],R7
1382*c83a76b0SSuyog Pawar
1383*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #3
1384*c83a76b0SSuyog Pawar    LSR R7,R7,#1
1385*c83a76b0SSuyog Pawar    SUB R2,R2,R7
1386*c83a76b0SSuyog Pawar
1387*c83a76b0SSuyog Pawar    ADD R2,R2,#8                    @MOVE TO the next COLUMN of pi4_tmp
1388*c83a76b0SSuyog Pawar
1389*c83a76b0SSuyog Pawar    ADD R8,R8,#2                    @increment loop cntr
1390*c83a76b0SSuyog Pawar    CMP R8,#16                      @check loop cntr
1391*c83a76b0SSuyog Pawar    BNE CORE_LOOP_16X16_HORIZ       @jump accordingly
1392*c83a76b0SSuyog Pawar
1393*c83a76b0SSuyog Pawar
1394*c83a76b0SSuyog Pawar@*****************Vertical transform************************************
1395*c83a76b0SSuyog Pawar
1396*c83a76b0SSuyog Pawar@Initialization for vert transform
1397*c83a76b0SSuyog Pawar@pi4_tmp will be the new src
1398*c83a76b0SSuyog Pawar@tmp stride will be new src stride
1399*c83a76b0SSuyog Pawar@dst will be new pi4_tmp
1400*c83a76b0SSuyog Pawar@dst stride will be new tmp stride
1401*c83a76b0SSuyog Pawar@transform table entries are 32 bit
1402*c83a76b0SSuyog Pawar
1403*c83a76b0SSuyog Pawar    LDR R9,g_ai4_ihevc_trans_16_addr    @get 32 bit transform matrix
1404*c83a76b0SSuyog Pawarulbl3:
1405*c83a76b0SSuyog Pawar    ADD R9, R9, PC
1406*c83a76b0SSuyog Pawar
1407*c83a76b0SSuyog Pawar    SUB R0,R2,#64                   @set tmp as src [-64 to move back to origin]
1408*c83a76b0SSuyog Pawar MOV R2,R3 @set dst as tmp 1409*c83a76b0SSuyog Pawar MOV R4,#TMP_STRIDE @set tmp stride as src stride 1410*c83a76b0SSuyog Pawar LSL R7,R6,#1 @Set dst stride as tmp stride 1411*c83a76b0SSuyog Pawar SUB R4,#48 @Adjust stride 3 previous loads 1412*c83a76b0SSuyog Pawar 1413*c83a76b0SSuyog Pawar @Block SAD 1414*c83a76b0SSuyog Pawar VADD.S32 D28,D28,D29 1415*c83a76b0SSuyog Pawar VPADD.S32 D28,D28,D29 1416*c83a76b0SSuyog Pawar VMOV.S32 R3,D28[0] 1417*c83a76b0SSuyog Pawar @ SAD calculation ends -- final value in R3. 1418*c83a76b0SSuyog Pawar 1419*c83a76b0SSuyog Pawar @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] 1420*c83a76b0SSuyog Pawar @values of g_ai4_ihevc_trans_16 and write to stack 1421*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_W 1422*c83a76b0SSuyog Pawar LSL R12,R12,#2 1423*c83a76b0SSuyog Pawar VLD1.S32 D28,[R9],R12 1424*c83a76b0SSuyog Pawar VLD1.S32 D29,[R9],R12 1425*c83a76b0SSuyog Pawar VLD1.S32 D30,[R9],R12 1426*c83a76b0SSuyog Pawar VLD1.S32 D31,[R9],R12 1427*c83a76b0SSuyog Pawar SUB R9,R9,R12,LSL #2 1428*c83a76b0SSuyog Pawar 1429*c83a76b0SSuyog Pawar VREV64.32 Q15,Q15 1430*c83a76b0SSuyog Pawar VTRN.S32 Q14,Q15 1431*c83a76b0SSuyog Pawar VST1.S32 {Q14-Q15},[SP] 1432*c83a76b0SSuyog Pawar 1433*c83a76b0SSuyog Pawar VMOV.U32 Q14,#RADD @get the round factor to q14 1434*c83a76b0SSuyog Pawar VMOV.U32 Q15,#SHIFT @Get the shift to neon 1435*c83a76b0SSuyog Pawar 1436*c83a76b0SSuyog Pawar MOV R8,#0 @INIT LOOP 1437*c83a76b0SSuyog Pawar 1438*c83a76b0SSuyog PawarCORE_LOOP_16X16_VERT: 1439*c83a76b0SSuyog Pawar 1440*c83a76b0SSuyog Pawar VLD1.S32 {D0,D1},[R0]! @LOAD 1-4 src R1 1441*c83a76b0SSuyog Pawar VLD1.S32 {D2,D3},[R0]! @LOAD 5-8 pred R1 1442*c83a76b0SSuyog Pawar VLD1.S32 {D4,D5},[R0]! @LOAD 9-12 src R1 1443*c83a76b0SSuyog Pawar VLD1.S32 {D6,D7},[R0],R4 @LOAD 12-16 pred R1 1444*c83a76b0SSuyog Pawar 1445*c83a76b0SSuyog Pawar VLD1.S32 {D8,D9},[R0]! @LOAD 1-4 src R2 1446*c83a76b0SSuyog Pawar VLD1.S32 {D10,D11},[R0]! @LOAD 5-8 pred R2 1447*c83a76b0SSuyog Pawar VLD1.S32 {D12,D13},[R0]! @LOAD 9-12 src R2 1448*c83a76b0SSuyog Pawar VLD1.S32 {D14,D15},[R0],R4 @LOAD 12-16 pred R2 1449*c83a76b0SSuyog Pawar 1450*c83a76b0SSuyog Pawar VREV64.S32 Q2,Q2 @Rev 9-12 R1 1451*c83a76b0SSuyog Pawar VREV64.S32 Q3,Q3 @Rev 12-16 R1 1452*c83a76b0SSuyog Pawar VREV64.S32 Q6,Q6 @Rev 9-12 R2 1453*c83a76b0SSuyog Pawar VREV64.S32 Q7,Q7 @Rev 12-16 R2 1454*c83a76b0SSuyog Pawar 1455*c83a76b0SSuyog Pawar VSWP D6,D7 1456*c83a76b0SSuyog Pawar VSWP D4,D5 1457*c83a76b0SSuyog Pawar VADD.S32 Q8 ,Q0,Q3 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R1 1458*c83a76b0SSuyog Pawar VSWP D12,D13 @ dual issued with prev. instruction 1459*c83a76b0SSuyog Pawar VADD.S32 Q9 ,Q1,Q2 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R1 1460*c83a76b0SSuyog Pawar VSWP D14,D15 @ dual issued with prev. instruction 1461*c83a76b0SSuyog Pawar VSUB.S32 Q10,Q0,Q3 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R1 1462*c83a76b0SSuyog Pawar VSUB.S32 Q11,Q1,Q2 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R1 1463*c83a76b0SSuyog Pawar 1464*c83a76b0SSuyog Pawar VADD.S32 Q12,Q4,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R2 1465*c83a76b0SSuyog Pawar VREV64.S32 Q9 ,Q9 @rev e[k] k-> 4-7 R1, dual issued with prev. instruction 1466*c83a76b0SSuyog Pawar VADD.S32 Q13,Q5,Q6 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R2 1467*c83a76b0SSuyog Pawar VSUB.S32 Q0 ,Q4,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R2 1468*c83a76b0SSuyog Pawar VSWP D18,D19 @ dual issued with prev. 
instruction 1469*c83a76b0SSuyog Pawar VSUB.S32 Q1 ,Q5,Q6 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R2 1470*c83a76b0SSuyog Pawar VREV64.S32 Q13,Q13 @rev e[k] k-> 4-7 R2, dual issued with prev. instruction 1471*c83a76b0SSuyog Pawar 1472*c83a76b0SSuyog Pawar VADD.S32 Q2,Q8,Q9 @ee[k] = e[k] + e[7 - k] row R1 1473*c83a76b0SSuyog Pawar VSUB.S32 Q3,Q8,Q9 @eo[k] = e[k] - e[7 - k] row R1 1474*c83a76b0SSuyog Pawar VSWP D26,D27 1475*c83a76b0SSuyog Pawar 1476*c83a76b0SSuyog Pawar 1477*c83a76b0SSuyog Pawar VADD.S32 Q4,Q12,Q13 @ee[k] = e[k] + e[7 - k] row R2 1478*c83a76b0SSuyog Pawar VSUB.S32 Q5,Q12,Q13 @eo[k] = e[k] - e[7 - k] row R2 1479*c83a76b0SSuyog Pawar VREV64.S32 D5,D5 @rev ee[k] 4-7 R1, dual issued with prev. instruction 1480*c83a76b0SSuyog Pawar 1481*c83a76b0SSuyog Pawar VADD.S32 D12,D4,D5 @eee[0] eee[1] R1 1482*c83a76b0SSuyog Pawar VSUB.S32 D13,D4,D5 @eeo[0] eeo[1] R1 1483*c83a76b0SSuyog Pawar VREV64.S32 D9,D9 @rev ee[k] 4-7 R2, dual issued with prev. instruction 1484*c83a76b0SSuyog Pawar 1485*c83a76b0SSuyog Pawar 1486*c83a76b0SSuyog Pawar VADD.S32 D14,D8,D9 @eee[0] eee[1] R2 1487*c83a76b0SSuyog Pawar VSUB.S32 D15,D8,D9 @eeo[0] eeo[1] R2 1488*c83a76b0SSuyog Pawar 1489*c83a76b0SSuyog Pawar VLD1.S32 {Q12,Q13},[SP] @Load g_ai2_ihevc_trans_16[xx]-> Q12 : [0 0] [8 0] [4 0] [12 0] Q13 : [0 1] [8 1] [4 1] [12 1] 1490*c83a76b0SSuyog Pawar VREV64.S32 Q8,Q6 @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1 -> ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1 1491*c83a76b0SSuyog Pawar 1492*c83a76b0SSuyog Pawar VREV64.S32 Q9,Q7 @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2 -> ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2 1493*c83a76b0SSuyog Pawar 1494*c83a76b0SSuyog Pawar 1495*c83a76b0SSuyog Pawar VMUL.S32 Q4,Q6,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R1 1496*c83a76b0SSuyog Pawar VMLA.S32 Q4,Q8,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R1 1497*c83a76b0SSuyog Pawar 1498*c83a76b0SSuyog Pawar VMUL.S32 Q6,Q7,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R2 1499*c83a76b0SSuyog Pawar VMLA.S32 Q6,Q9,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2 1500*c83a76b0SSuyog Pawar 1501*c83a76b0SSuyog Pawar @Q3 :R1E00 R1E01 R1E02 R1E03 1502*c83a76b0SSuyog Pawar @Q5 :R2E00 R2E01 R2E02 R2E03 1503*c83a76b0SSuyog Pawar VSWP D7,D10 @ dual issued with prev. instruction 1504*c83a76b0SSuyog Pawar @Q3 :R1E00 R1E01 R2E00 R2E01 1505*c83a76b0SSuyog Pawar @Q5 :R1E02 R1E03 R2E02 R2E03 1506*c83a76b0SSuyog Pawar VSWP D7,D11 1507*c83a76b0SSuyog Pawar @Q3 :R1E00 R1E01 R2E02 R2E03 1508*c83a76b0SSuyog Pawar @Q5 :R1E02 R1E03 R2E00 R2E01 1509*c83a76b0SSuyog Pawar 1510*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_W 1511*c83a76b0SSuyog Pawar ADD R11,R9,R12,LSL #1 @Get to the 2nd row of src 1512*c83a76b0SSuyog Pawar LSL R12,R12,#2 1513*c83a76b0SSuyog Pawar 1514*c83a76b0SSuyog Pawar VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr. 1515*c83a76b0SSuyog Pawar 1516*c83a76b0SSuyog Pawar VADD.S32 Q4,Q4,Q14 @ROUND R1 1517*c83a76b0SSuyog Pawar VMUL.S32 Q12,Q3,Q7 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction 1518*c83a76b0SSuyog Pawar VSWP D14,D15 @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. 
instruction 1519*c83a76b0SSuyog Pawar 1520*c83a76b0SSuyog Pawar VADD.S32 Q6,Q6,Q14 @ROUND R2 1521*c83a76b0SSuyog Pawar 1522*c83a76b0SSuyog Pawar VSHRN.S32 D8,Q4,#SHIFT @NARROW R1 1523*c83a76b0SSuyog Pawar 1524*c83a76b0SSuyog Pawar VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1525*c83a76b0SSuyog Pawar VSHRN.S32 D9,Q6,#SHIFT @NARROW R2, dual issued in 2nd cycle 1526*c83a76b0SSuyog Pawar 1527*c83a76b0SSuyog Pawar VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction 1528*c83a76b0SSuyog Pawar VSWP D16,D17 @dual issued with prev. instr. 1529*c83a76b0SSuyog Pawar 1530*c83a76b0SSuyog Pawar VZIP.S16 D8,D9 @INTERLEAVE R1 R2 R1 R2 R1 R2 to write 1531*c83a76b0SSuyog Pawar VMLA.S32 Q12,Q5,Q7 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction 1532*c83a76b0SSuyog Pawar 1533*c83a76b0SSuyog Pawar 1534*c83a76b0SSuyog Pawar @WRITE INTO MEM the values or wait to be shuffled 1535*c83a76b0SSuyog Pawar @These values must go to 0 4 8 12 colums 1536*c83a76b0SSuyog Pawar LSL R10,R7,#2 1537*c83a76b0SSuyog Pawar VST1.S32 D8[0],[R2],R10 1538*c83a76b0SSuyog Pawar 1539*c83a76b0SSuyog Pawar VST1.S32 D9[0],[R2],R10 1540*c83a76b0SSuyog Pawar 1541*c83a76b0SSuyog Pawar VST1.S32 D8[1],[R2],R10 1542*c83a76b0SSuyog Pawar VPADD.S32 D18,D24,D25 @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03 1543*c83a76b0SSuyog Pawar @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01 1544*c83a76b0SSuyog Pawar 1545*c83a76b0SSuyog Pawar VST1.S32 D9[1],[R2],R10 1546*c83a76b0SSuyog Pawar VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1547*c83a76b0SSuyog Pawar LSL R10,R10,#2 1548*c83a76b0SSuyog Pawar SUB R2,R2,R10 1549*c83a76b0SSuyog Pawar 1550*c83a76b0SSuyog Pawar VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1551*c83a76b0SSuyog Pawar 1552*c83a76b0SSuyog Pawar VMUL.S32 Q6,Q3,Q7 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] 1553*c83a76b0SSuyog Pawar VSWP D14,D15 @ dual issued with prev. instruction 1554*c83a76b0SSuyog Pawar VPADD.S32 D19,D4,D5 1555*c83a76b0SSuyog Pawar 1556*c83a76b0SSuyog Pawar VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1557*c83a76b0SSuyog Pawar VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] 1558*c83a76b0SSuyog Pawar VSWP D16,D17 1559*c83a76b0SSuyog Pawar 1560*c83a76b0SSuyog Pawar VMLA.S32 Q6,Q5,Q7 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1561*c83a76b0SSuyog Pawar VADD.S32 Q9,Q9,Q14 @Round by RADD R1 1562*c83a76b0SSuyog Pawar VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1563*c83a76b0SSuyog Pawar VSHRN.S32 D8,Q9,#SHIFT @Shift by SHIFT 1564*c83a76b0SSuyog Pawar VPADD.S32 D24,D12,D13 1565*c83a76b0SSuyog Pawar @---------------Processing O, Row 1 and Row 2-------------------------------------- 1566*c83a76b0SSuyog Pawar @ Early start to avoid stalls 1567*c83a76b0SSuyog Pawar MOV R12,#COFF_STD_W 1568*c83a76b0SSuyog Pawar ADD R11,R9,R12 @Get 1ST row 1569*c83a76b0SSuyog Pawar LSL R12,R12,#1 1570*c83a76b0SSuyog Pawar 1571*c83a76b0SSuyog Pawar LSL R10,R7,#2 1572*c83a76b0SSuyog Pawar ADD R2,R2,R7,LSL #1 @move to third row 1573*c83a76b0SSuyog Pawar @this should go to 2 6 10 14 1574*c83a76b0SSuyog Pawar VST1.S32 D8[0],[R2],R10 1575*c83a76b0SSuyog Pawar 1576*c83a76b0SSuyog Pawar VST1.S32 D8[1],[R2],R10 1577*c83a76b0SSuyog Pawar VPADD.S32 D25,D4,D5 @ dual issued with prev. instruction in 2nd cycle 1578*c83a76b0SSuyog Pawar 1579*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1580*c83a76b0SSuyog Pawar VADD.S32 Q12,Q12,Q14 @Round by RADD R2, dual issued with prev. 
instruction in 2nd cycle 1581*c83a76b0SSuyog Pawar VMUL.S32 Q6,Q2,Q0 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2 1582*c83a76b0SSuyog Pawar VMLA.S32 Q6,Q3,Q1 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2 1583*c83a76b0SSuyog Pawar VSHRN.S32 D9,Q12,#SHIFT @Shift by SHIFT 1584*c83a76b0SSuyog Pawar 1585*c83a76b0SSuyog Pawar VMUL.S32 Q2,Q2,Q10 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1 1586*c83a76b0SSuyog Pawar VMLA.S32 Q2,Q3,Q11 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1 1587*c83a76b0SSuyog Pawar VADD.S32 D11,D12,D13 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr. 1588*c83a76b0SSuyog Pawar VST1.S32 D9[0],[R2],R10 1589*c83a76b0SSuyog Pawar 1590*c83a76b0SSuyog Pawar VST1.S32 D9[1],[R2],R10 1591*c83a76b0SSuyog Pawar VADD.S32 D10,D4,D5 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr. 1592*c83a76b0SSuyog Pawar LSL R10,R10,#2 @go back to orgin 1593*c83a76b0SSuyog Pawar SUB R2,R2,R10 1594*c83a76b0SSuyog Pawar SUB R2,R2,R7,LSL #1 1595*c83a76b0SSuyog Pawar 1596*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1597*c83a76b0SSuyog Pawar 1598*c83a76b0SSuyog Pawar VMUL.S32 Q7,Q2,Q10 @o[0][0-3] 1599*c83a76b0SSuyog Pawar VMLA.S32 Q7,Q3,Q11 @o[0][4-7] 1600*c83a76b0SSuyog Pawar VMUL.S32 Q8,Q2,Q0 @o[0][0-3] 1601*c83a76b0SSuyog Pawar VMLA.S32 Q8,Q3,Q1 @o[0][4-7] 1602*c83a76b0SSuyog Pawar 1603*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1604*c83a76b0SSuyog Pawar VADD.S32 D18,D14,D15 1605*c83a76b0SSuyog Pawar VMUL.S32 Q12,Q2,Q10 @o[0][0-3] 1606*c83a76b0SSuyog Pawar VMLA.S32 Q12,Q3,Q11 @o[0][4-7] 1607*c83a76b0SSuyog Pawar VADD.S32 D19,D16,D17 1608*c83a76b0SSuyog Pawar VMUL.S32 Q4,Q2,Q0 1609*c83a76b0SSuyog Pawar VMLA.S32 Q4,Q3,Q1 1610*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1611*c83a76b0SSuyog Pawar VADD.S32 D26,D24,D25 @ dual issued with prev. instr. 1612*c83a76b0SSuyog Pawar VMUL.S32 Q6,Q2,Q10 @o[0][0-3] 1613*c83a76b0SSuyog Pawar VMLA.S32 Q6,Q3,Q11 @o[0][4-7] 1614*c83a76b0SSuyog Pawar VADD.S32 D27,D8,D9 1615*c83a76b0SSuyog Pawar VMUL.S32 Q4,Q2,Q0 1616*c83a76b0SSuyog Pawar VMLA.S32 Q4,Q3,Q1 1617*c83a76b0SSuyog Pawar VADD.S32 D12,D12,D13 1618*c83a76b0SSuyog Pawar @Q5 Q9 Q13 Q6 1619*c83a76b0SSuyog Pawar VPADD.S32 D14,D10,D11 1620*c83a76b0SSuyog Pawar VPADD.S32 D15,D18,D19 1621*c83a76b0SSuyog Pawar VPADD.S32 D16,D26,D27 1622*c83a76b0SSuyog Pawar VADD.S32 D13,D8,D9 1623*c83a76b0SSuyog Pawar VADD.S32 Q9,Q7,Q14 1624*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[0][0-7] 1625*c83a76b0SSuyog Pawar VPADD.S32 D17,D12,D13 @ dual issued with prev. instr. in 2nd cycle 1626*c83a76b0SSuyog Pawar 1627*c83a76b0SSuyog Pawar VMUL.S32 Q4,Q2,Q10 @o[0][0-3] 1628*c83a76b0SSuyog Pawar VMLA.S32 Q4,Q3,Q11 @o[0][4-7] 1629*c83a76b0SSuyog Pawar 1630*c83a76b0SSuyog Pawar VADD.S32 Q12,Q8,Q14 1631*c83a76b0SSuyog Pawar 1632*c83a76b0SSuyog Pawar VMUL.S32 Q6,Q2,Q0 @o[0][0-3] 1633*c83a76b0SSuyog Pawar VMLA.S32 Q6,Q3,Q1 @o[0][4-7] 1634*c83a76b0SSuyog Pawar 1635*c83a76b0SSuyog Pawar VSHRN.S32 D26,Q9,#SHIFT 1636*c83a76b0SSuyog Pawar VSHRN.S32 D27,Q12,#SHIFT 1637*c83a76b0SSuyog Pawar VADD.S32 D10,D8,D9 1638*c83a76b0SSuyog Pawar @write to memory this should go to 1 3 5 7 1639*c83a76b0SSuyog Pawar ADD R2,R2,R7 1640*c83a76b0SSuyog Pawar LSL R7,R7,#1 1641*c83a76b0SSuyog Pawar VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1642*c83a76b0SSuyog Pawar VADD.S32 D11,D12,D13 @ dual issued with prev. instr. 
1643*c83a76b0SSuyog Pawar
1644*c83a76b0SSuyog Pawar    VST1.S32 D26[0],[R2],R7
1645*c83a76b0SSuyog Pawar    VMUL.S32 Q7,Q2,Q10              @o[0][0-3]
1646*c83a76b0SSuyog Pawar    VMLA.S32 Q7,Q3,Q11              @o[0][4-7]
1647*c83a76b0SSuyog Pawar    VST1.S32 D26[1],[R2],R7
1648*c83a76b0SSuyog Pawar    VMUL.S32 Q8,Q2,Q0               @o[0][0-3]
1649*c83a76b0SSuyog Pawar    VMLA.S32 Q8,Q3,Q1               @o[0][4-7]
1650*c83a76b0SSuyog Pawar    VST1.S32 D27[0],[R2],R7
1651*c83a76b0SSuyog Pawar    VADD.S32 D18,D14,D15
1652*c83a76b0SSuyog Pawar    VST1.S32 D27[1],[R2],R7
1653*c83a76b0SSuyog Pawar
1654*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12      @g_ai2_ihevc_trans_16[2][0-7]
1655*c83a76b0SSuyog Pawar    VADD.S32 D19,D16,D17            @ dual issued with prev. instr.
1656*c83a76b0SSuyog Pawar
1657*c83a76b0SSuyog Pawar    VMUL.S32 Q12,Q2,Q10             @o[0][0-3]
1658*c83a76b0SSuyog Pawar    VMLA.S32 Q12,Q3,Q11             @o[0][4-7]
1659*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1660*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1661*c83a76b0SSuyog Pawar
1662*c83a76b0SSuyog Pawar    VLD1.S32 {Q2,Q3},[R11],R12      @g_ai2_ihevc_trans_16[3][0-7]
1663*c83a76b0SSuyog Pawar    VADD.S32 D26,D24,D25
1664*c83a76b0SSuyog Pawar
1665*c83a76b0SSuyog Pawar    VMUL.S32 Q6,Q2,Q10              @o[0][0-3]
1666*c83a76b0SSuyog Pawar    VMLA.S32 Q6,Q3,Q11              @o[0][4-7]
1667*c83a76b0SSuyog Pawar    VADD.S32 D27,D8,D9
1668*c83a76b0SSuyog Pawar
1669*c83a76b0SSuyog Pawar    VMUL.S32 Q4,Q2,Q0
1670*c83a76b0SSuyog Pawar    VMLA.S32 Q4,Q3,Q1
1671*c83a76b0SSuyog Pawar    VADD.S32 D12,D12,D13
1672*c83a76b0SSuyog Pawar    @Q5 Q9 Q13 Q6
1673*c83a76b0SSuyog Pawar    VPADD.S32 D14,D10,D11
1674*c83a76b0SSuyog Pawar    VPADD.S32 D15,D18,D19
1675*c83a76b0SSuyog Pawar    VPADD.S32 D16,D26,D27
1676*c83a76b0SSuyog Pawar    VADD.S32 D13,D8,D9
1677*c83a76b0SSuyog Pawar    VADD.S32 Q9,Q7,Q14
1678*c83a76b0SSuyog Pawar    @ 1-cycle stall?
1679*c83a76b0SSuyog Pawar    VPADD.S32 D17,D12,D13
1680*c83a76b0SSuyog Pawar    VSHRN.S32 D22,Q9,#SHIFT
1681*c83a76b0SSuyog Pawar    VADD.S32 Q10,Q8,Q14
1682*c83a76b0SSuyog Pawar    @ 2-cycle stall?
1683*c83a76b0SSuyog Pawar    VSHRN.S32 D23,Q10,#SHIFT
1684*c83a76b0SSuyog Pawar
1685*c83a76b0SSuyog Pawar    @this should go to 9 11 13 15
1686*c83a76b0SSuyog Pawar    @LSL R11,R7,#1
1687*c83a76b0SSuyog Pawar    VST1.S32 D22[0],[R2],R7
1688*c83a76b0SSuyog Pawar    VST1.S32 D22[1],[R2],R7
1689*c83a76b0SSuyog Pawar    VST1.S32 D23[0],[R2],R7
1690*c83a76b0SSuyog Pawar    VST1.S32 D23[1],[R2],R7
1691*c83a76b0SSuyog Pawar
1692*c83a76b0SSuyog Pawar    SUB R2,R2,R7,LSL #3
1693*c83a76b0SSuyog Pawar    LSR R7,R7,#1
1694*c83a76b0SSuyog Pawar    SUB R2,R2,R7
1695*c83a76b0SSuyog Pawar
1696*c83a76b0SSuyog Pawar    ADD R2,R2,#4                    @MOVE TO the next COLUMN
1697*c83a76b0SSuyog Pawar
1698*c83a76b0SSuyog Pawar    ADD R8,R8,#2                    @increment loop cntr by 2 since we process loop as 2 cols
1699*c83a76b0SSuyog Pawar    CMP R8,#16                      @check loop cntr
1700*c83a76b0SSuyog Pawar    BNE CORE_LOOP_16X16_VERT        @jump accordingly
1701*c83a76b0SSuyog Pawar
1702*c83a76b0SSuyog Pawar    MOV R0,R3                       @return the accumulated block SAD in R0
1703*c83a76b0SSuyog Pawar
1704*c83a76b0SSuyog Pawar    ADD SP,SP,#32                   @release the scratch area reserved on the stack
1705*c83a76b0SSuyog Pawar    vpop {d8 - d15}
1706*c83a76b0SSuyog Pawar    LDMFD sp!,{r4-r12,PC}           @restore callee-saved registers and return to caller
1707*c83a76b0SSuyog Pawar
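@ End of ihevc_resi_trans_16x16_a9q.
@
@ For reference, a scalar C sketch of what the two passes above compute is
@ given below (editorial addition; 'resi', 'tmp' and the loop structure are
@ illustrative - only g_ai2_ihevc_trans_16, g_ai4_ihevc_trans_16 and the
@ RADD/SHIFT constants are taken from this file). The luma path
@ (chr_plane == -1) is shown; the chroma paths only change how src and pred
@ are de-interleaved before the subtraction.
@
@     /* sketch, assuming 8-bit src/pred and the register mapping above */
@     uint32_t sad = 0;
@     int16_t  resi[16][16];
@     int32_t  tmp[16][16];            /* pi4_tmp, stored transposed      */
@     for (int j = 0; j < 16; j++)
@         for (int k = 0; k < 16; k++) {
@             int d = pu1_src[j * src_strd + k] - pu1_pred[j * pred_strd + k];
@             resi[j][k] = (int16_t)d;
@             sad += abs(d);
@         }
@     /* horizontal pass: no rounding or shift, results kept in 32 bit    */
@     for (int i = 0; i < 16; i++)
@         for (int j = 0; j < 16; j++) {
@             int32_t s = 0;
@             for (int k = 0; k < 16; k++)
@                 s += g_ai2_ihevc_trans_16[i][k] * resi[j][k];
@             tmp[i][j] = s;
@         }
@     /* vertical pass: single combined rounding by RADD, shift by SHIFT  */
@     for (int v = 0; v < 16; v++)
@         for (int u = 0; u < 16; u++) {
@             int32_t s = 0;
@             for (int j = 0; j < 16; j++)
@                 s += g_ai4_ihevc_trans_16[v][j] * tmp[u][j];
@             pi2_dst[v * dst_strd + u] = (int16_t)((s + RADD) >> SHIFT);
@         }
@     return sad;                      /* the assembly leaves this in r0  */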