1*a97c2a1fSXin Li@/****************************************************************************** 2*a97c2a1fSXin Li@ * 3*a97c2a1fSXin Li@ * Copyright (C) 2015 The Android Open Source Project 4*a97c2a1fSXin Li@ * 5*a97c2a1fSXin Li@ * Licensed under the Apache License, Version 2.0 (the "License"); 6*a97c2a1fSXin Li@ * you may not use this file except in compliance with the License. 7*a97c2a1fSXin Li@ * You may obtain a copy of the License at: 8*a97c2a1fSXin Li@ * 9*a97c2a1fSXin Li@ * http://www.apache.org/licenses/LICENSE-2.0 10*a97c2a1fSXin Li@ * 11*a97c2a1fSXin Li@ * Unless required by applicable law or agreed to in writing, software 12*a97c2a1fSXin Li@ * distributed under the License is distributed on an "AS IS" BASIS, 13*a97c2a1fSXin Li@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14*a97c2a1fSXin Li@ * See the License for the specific language governing permissions and 15*a97c2a1fSXin Li@ * limitations under the License. 16*a97c2a1fSXin Li@ * 17*a97c2a1fSXin Li@ ***************************************************************************** 18*a97c2a1fSXin Li@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*a97c2a1fSXin Li@*/ 20*a97c2a1fSXin Li 21*a97c2a1fSXin Li@/* 22*a97c2a1fSXin Li@//---------------------------------------------------------------------------- 23*a97c2a1fSXin Li@// File Name : impeg2_idct.s 24*a97c2a1fSXin Li@// 25*a97c2a1fSXin Li@// Description : This file has the Idct Implementations for the 26*a97c2a1fSXin Li@// MPEG2 SP decoder on neon platform. 
@//
@//  Reference Document     :
@//
@//  Revision History       :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   Feb 22, 2008     Naveen Kumar T                Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/

.text
.p2align 2

@// Right-shift amounts and matching round-to-nearest addends used after the
@// first and second IDCT stages.
.equ idct_stg1_shift       ,   12
.equ idct_stg2_shift       ,   16
.equ idct_stg1_round       ,   (1 << (idct_stg1_shift - 1))
.equ idct_stg2_round       ,   (1 << (idct_stg2_shift - 1))

@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@//--------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@// Coefficient / additive tables defined outside this file.
    .extern gai2_impeg2_idct_q15
.hidden gai2_impeg2_idct_q15
    .extern gai2_impeg2_idct_q11
.hidden gai2_impeg2_idct_q11
    .extern gai2_impeg2_idct_first_col_q15
.hidden gai2_impeg2_idct_first_col_q15
    .extern gai2_impeg2_idct_first_col_q11
.hidden gai2_impeg2_idct_first_col_q11
    .extern gai2_impeg2_mismatch_stg2_additive
.hidden gai2_impeg2_mismatch_stg2_additive

@// ----------------------------------------------------------------------------
@// PC-relative literal pool.
@// Every word holds (table - use_label - 8). The consumer does
@//      ldr   rX, <word>
@//  use_label:
@//      add   rX, rX, pc
@// so rX ends up holding the table address without an absolute relocation.
@// The "- 8" matches pc reading as (use_label + 8), i.e. ARM-state execution
@// is assumed.
@// ----------------------------------------------------------------------------
gai2_impeg2_idct_q15_addr1:
    .long gai2_impeg2_idct_q15 - q15lbl1 - 8
gai2_impeg2_idct_q15_addr2:
    .long gai2_impeg2_idct_q15 - q15lbl2 - 8
gai2_impeg2_idct_q11_addr1:
    .long gai2_impeg2_idct_q11 - q11lbl1 - 8
gai2_impeg2_idct_q11_addr2:
    .long gai2_impeg2_idct_q11 - q11lbl2 - 8
gai2_impeg2_idct_first_col_q15_addr1:
    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8
gai2_impeg2_idct_first_col_q15_addr2:
    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8
gai2_impeg2_idct_first_col_q15_addr3:
    .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8
gai2_impeg2_mismatch_stg2_additive_addr:
    .long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8
gai2_impeg2_idct_first_col_q11_addr1:
    .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8
gai2_impeg2_idct_first_col_q11_addr2:
    .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8

@// ----------------------------------------------------------------------------
@// impeg2_idct_recon_dc_a9q
@//
@// DC-only inverse transform + reconstruction of an 8x8 block.
@//
@//   dc  = (pi2_src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round)
@//                                                   >> idct_stg1_shift
@//   dc  = (dc         * gai2_impeg2_idct_q11[0] + idct_stg2_round)
@//                                                   >> idct_stg2_shift
@//   dst[row][col] = saturate_u8(pred[row][col] + dc)    for 8 rows x 8 cols
@//
@// In:    r0 = pi2_src
@//        r1 = pi2_tmp   (never read; the register is reused for pred_strd)
@//        r2 = pu1_pred
@//        r3 = pu1_dst
@//        [entry sp + 4] = pred_strd
@//        [entry sp + 8] = dst_strd
@//        ([entry sp + 0] is not read here - presumably src_strd, TODO confirm)
@// Saved: r4, r6, r12, lr and d8-d15 are pushed on entry and restored on exit.
@// Other registers written: r1, r2, r3, d0-d7, q8-q11, q15.
@// ----------------------------------------------------------------------------
    .global impeg2_idct_recon_dc_a9q
impeg2_idct_recon_dc_a9q:
    push        {r4, r6, r12, lr}               @// 16 bytes
    vpush       {d8-d15}                        @// 64 bytes -> 80 bytes below entry sp

    ldr         r1, [sp, #84]                   @// pred_strd (entry sp + 4)
    ldr         r6, [sp, #88]                   @// dst_strd  (entry sp + 8)

    ldr         lr, gai2_impeg2_idct_q15_addr1
q15lbl1:
    add         lr, lr, pc                      @// lr = &gai2_impeg2_idct_q15[0]
    ldrsh       r12, [lr]                       @// first stage-1 coefficient
    ldrsh       r4, [r0]                        @// DC input, pi2_src[0]

    @// Prediction row loads are interleaved with the scalar DC computation.
    vld1.8      {d0}, [r2], r1                  @// pred row 0
    mul         r4, r4, r12

    vld1.8      {d1}, [r2], r1                  @// pred row 1
    add         r4, r4, #idct_stg1_round

    vld1.8      {d2}, [r2], r1                  @// pred row 2
    asr         r4, r4, #idct_stg1_shift        @// stage-1 result

    ldr         lr, gai2_impeg2_idct_q11_addr1
q11lbl1:
    add         lr, lr, pc                      @// lr = &gai2_impeg2_idct_q11[0]
    ldrsh       r12, [lr]                       @// first stage-2 coefficient

    vld1.8      {d3}, [r2], r1                  @// pred row 3
    mul         r4, r4, r12

    vld1.8      {d4}, [r2], r1                  @// pred row 4
    add         r4, r4, #idct_stg2_round

    vld1.8      {d5}, [r2], r1                  @// pred row 5
    asr         r4, r4, #idct_stg2_shift        @// stage-2 result = residual value

    vld1.8      {d6}, [r2], r1                  @// pred row 6
    vdup.16     q15, r4                         @// low 16 bits of residual in all 8 lanes

    vld1.8      {d7}, [r2], r1                  @// pred row 7

    @// For each row: widen pred to 16 bit and add the residual, narrow with
    @// unsigned saturation, store. Stores trail the arithmetic by one row.
    vaddw.u8    q4, q15, d0

    vaddw.u8    q5, q15, d1
    vqmovun.s16 d0, q4

    vaddw.u8    q6, q15, d2
    vqmovun.s16 d1, q5
    vst1.8      {d0}, [r3], r6                  @// dst row 0

    vaddw.u8    q7, q15, d3
    vqmovun.s16 d2, q6
    vst1.8      {d1}, [r3], r6                  @// dst row 1

    vaddw.u8    q8, q15, d4
    vqmovun.s16 d3, q7
    vst1.8      {d2}, [r3], r6                  @// dst row 2

    vaddw.u8    q9, q15, d5
    vqmovun.s16 d4, q8
    vst1.8      {d3}, [r3], r6                  @// dst row 3

    vaddw.u8    q10, q15, d6
    vqmovun.s16 d5, q9
    vst1.8      {d4}, [r3], r6                  @// dst row 4

    vaddw.u8    q11, q15, d7
    vqmovun.s16 d6, q10
    vst1.8      {d5}, [r3], r6                  @// dst row 5

    vqmovun.s16 d7, q11
    vst1.8      {d6}, [r3], r6                  @// dst row 6

    vst1.8      {d7}, [r3], r6                  @// dst row 7

    vpop        {d8-d15}
    pop         {r4, r6, r12, pc}




@// ----------------------------------------------------------------------------
@// impeg2_idct_recon_dc_mismatch_a9q
@//
@// DC reconstruction of an 8x8 block with a per-sample stage-2 additive taken
@// from gai2_impeg2_mismatch_stg2_additive (read as 8 rows of 8 halfwords).
@//
@//   t   = (pi2_src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round)
@//                                                   >> idct_stg1_shift
@//   t   = t * gai2_impeg2_idct_q11[0]              (kept as a 32-bit product)
@//   res[row][col] = (t + additive[row][col] + (1 << 15)) >> 16
@//   dst[row][col] = saturate_u8(pred[row][col] + res[row][col])
@//
@// In:    r0 = pi2_src
@//        r1 = not read; the register is reused for pred_strd
@//        r2 = pu1_pred
@//        r3 = pu1_dst
@//        [entry sp + 4] = pred_strd
@//        [entry sp + 8] = dst_strd
@// Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@// Other registers written: r1, r2, r3, q0, q1, d30.
@// ----------------------------------------------------------------------------
    .global impeg2_idct_recon_dc_mismatch_a9q
impeg2_idct_recon_dc_mismatch_a9q:
    push        {r4-r12, lr}                    @// 40 bytes
    vpush       {d8-d15}                        @// 64 bytes -> 104 bytes below entry sp

    ldr         r1, [sp, #108]                  @// pred_strd (entry sp + 4)
    ldr         r6, [sp, #112]                  @// dst_strd  (entry sp + 8)

    ldr         lr, gai2_impeg2_idct_q15_addr2
q15lbl2:
    add         lr, lr, pc                      @// lr = &gai2_impeg2_idct_q15[0]
    ldrsh       r12, [lr]
    ldrsh       r4, [r0]                        @// DC input, pi2_src[0]

    mul         r4, r4, r12
    add         r4, r4, #idct_stg1_round
    asr         r4, r4, #idct_stg1_shift        @// stage-1 result

    ldr         lr, gai2_impeg2_idct_q11_addr2
q11lbl2:
    add         lr, lr, pc                      @// lr = &gai2_impeg2_idct_q11[0]
    ldrsh       r12, [lr]
    mul         r4, r4, r12                     @// stage-2 product, not yet rounded/shifted
    vdup.32     q0, r4                          @// broadcast to 4 x 32-bit lanes

    mov         lr, #16                         @// table row stride: 8 halfwords
    ldr         r4, gai2_impeg2_mismatch_stg2_additive_addr
additive_lbl:
    add         r4, r4, pc                      @// r4 = &gai2_impeg2_mismatch_stg2_additive[0]

    @// One iteration per output row; the assembler expands this into eight
    @// back-to-back copies of the same instruction sequence.
    .rept 8
    vld1.16     {d2, d3}, [r4], lr              @// 8 additive halfwords for this row
    vld1.8      {d30}, [r2], r1                 @// 8 prediction bytes
    vmovl.s16   q4, d2                          @// sign-extend additives, lanes 0-3
    vmovl.s16   q5, d3                          @// sign-extend additives, lanes 4-7
    vraddhn.i32 d12, q0, q4                     @// rounded high half of (product + additive)
    vraddhn.i32 d13, q0, q5
    vaddw.u8    q7, q6, d30                     @// residual + widened prediction
    vqmovun.s16 d30, q7                         @// saturate to unsigned 8 bit
    vst1.8      {d30}, [r3], r6                 @// store row, advance by dst_strd
    .endr

    vpop        {d8-d15}
    pop         {r4-r12, pc}




@/**
@
******************************************************************************* 318*a97c2a1fSXin Li@ * 319*a97c2a1fSXin Li@ * ;brief 320*a97c2a1fSXin Li@ * This function performs Inverse transform and reconstruction for 8x8 321*a97c2a1fSXin Li@ * input block 322*a97c2a1fSXin Li@ * 323*a97c2a1fSXin Li@ * ;par Description: 324*a97c2a1fSXin Li@ * Performs inverse transform and adds the prediction data and clips output 325*a97c2a1fSXin Li@ * to 8 bit 326*a97c2a1fSXin Li@ * 327*a97c2a1fSXin Li@ * ;param[in] pi2_src 328*a97c2a1fSXin Li@ * Input 8x8 coefficients 329*a97c2a1fSXin Li@ * 330*a97c2a1fSXin Li@ * ;param[in] pi2_tmp 331*a97c2a1fSXin Li@ * Temporary 8x8 buffer for storing inverse 332*a97c2a1fSXin Li@ * 333*a97c2a1fSXin Li@ * transform 334*a97c2a1fSXin Li@ * 1st stage output 335*a97c2a1fSXin Li@ * 336*a97c2a1fSXin Li@ * ;param[in] pu1_pred 337*a97c2a1fSXin Li@ * Prediction 8x8 block 338*a97c2a1fSXin Li@ * 339*a97c2a1fSXin Li@ * ;param[out] pu1_dst 340*a97c2a1fSXin Li@ * Output 8x8 block 341*a97c2a1fSXin Li@ * 342*a97c2a1fSXin Li@ * ;param[in] src_strd 343*a97c2a1fSXin Li@ * Input stride 344*a97c2a1fSXin Li@ * 345*a97c2a1fSXin Li@ * ;param[in] pred_strd 346*a97c2a1fSXin Li@ * Prediction stride 347*a97c2a1fSXin Li@ * 348*a97c2a1fSXin Li@ * ;param[in] dst_strd 349*a97c2a1fSXin Li@ * Output Stride 350*a97c2a1fSXin Li@ * 351*a97c2a1fSXin Li@ * ;param[in] zero_cols 352*a97c2a1fSXin Li@ * Zero columns in pi2_src 353*a97c2a1fSXin Li@ * 354*a97c2a1fSXin Li@ * ;param[in] zero_rows 355*a97c2a1fSXin Li@ * Zero rows in pi2_src 356*a97c2a1fSXin Li@ * 357*a97c2a1fSXin Li@ * ;returns Void 358*a97c2a1fSXin Li@ * 359*a97c2a1fSXin Li@ * ;remarks 360*a97c2a1fSXin Li@ * None 361*a97c2a1fSXin Li@ * 362*a97c2a1fSXin Li@ ******************************************************************************* 363*a97c2a1fSXin Li@ */ 364*a97c2a1fSXin Li 365*a97c2a1fSXin Li@void impeg2_itrans_recon_8x8(WORD16 *pi2_src, 366*a97c2a1fSXin Li@ WORD16 *pi2_tmp, 367*a97c2a1fSXin Li@ UWORD8 *pu1_pred, 368*a97c2a1fSXin
Li@ UWORD8 *pu1_dst, 369*a97c2a1fSXin Li@ WORD32 src_strd, 370*a97c2a1fSXin Li@ WORD32 pred_strd, 371*a97c2a1fSXin Li@ WORD32 dst_strd, 372*a97c2a1fSXin Li@ WORD32 zero_cols 373*a97c2a1fSXin Li@ WORD32 zero_rows ) 374*a97c2a1fSXin Li 375*a97c2a1fSXin Li@**************Variables Vs Registers************************* 376*a97c2a1fSXin Li@ r0 => *pi2_src 377*a97c2a1fSXin Li@ r1 => *pi2_tmp 378*a97c2a1fSXin Li@ r2 => *pu1_pred 379*a97c2a1fSXin Li@ r3 => *pu1_dst 380*a97c2a1fSXin Li@ src_strd 381*a97c2a1fSXin Li@ pred_strd 382*a97c2a1fSXin Li@ dst_strd 383*a97c2a1fSXin Li@ zero_cols 384*a97c2a1fSXin Li 385*a97c2a1fSXin Li 386*a97c2a1fSXin Li 387*a97c2a1fSXin Li .global impeg2_idct_recon_a9q 388*a97c2a1fSXin Liimpeg2_idct_recon_a9q: 389*a97c2a1fSXin Li@//Register Usage Reference - loading and Until IDCT of columns 390*a97c2a1fSXin Li@// Cosine Constants - D0 391*a97c2a1fSXin Li@// Sine Constants - D1 392*a97c2a1fSXin Li@// Row 0 First Half - D2 - y0 393*a97c2a1fSXin Li@// Row 1 First Half - D6 - y1 394*a97c2a1fSXin Li@// Row 2 First Half - D3 - y2 395*a97c2a1fSXin Li@// Row 3 First Half - D7 - y3 396*a97c2a1fSXin Li@// Row 4 First Half - D10 - y4 397*a97c2a1fSXin Li@// Row 5 First Half - D14 - y5 398*a97c2a1fSXin Li@// Row 6 First Half - D11 - y6 399*a97c2a1fSXin Li@// Row 7 First Half - D15 - y7 400*a97c2a1fSXin Li 401*a97c2a1fSXin Li@// Row 0 Second Half - D4 - y0 402*a97c2a1fSXin Li@// Row 1 Second Half - D8 - y1 403*a97c2a1fSXin Li@// Row 2 Second Half - D5 - y2 404*a97c2a1fSXin Li@// Row 3 Second Half - D9 - y3 405*a97c2a1fSXin Li@// Row 4 Second Half - D12 - y4 406*a97c2a1fSXin Li@// Row 5 Second Half - D16 - y5 407*a97c2a1fSXin Li@// Row 6 Second Half - D13 - y6 408*a97c2a1fSXin Li@// Row 7 Second Half - D17 - y7 409*a97c2a1fSXin Li 410*a97c2a1fSXin Li @// Copy the input pointer to another register 411*a97c2a1fSXin Li @// Step 1 : load all constants 412*a97c2a1fSXin Li stmfd sp!, {r4-r12, lr} 413*a97c2a1fSXin Li vpush {d8-d15} 414*a97c2a1fSXin Li 415*a97c2a1fSXin Li 
ldr r8, [sp, #108] @ prediction stride 416*a97c2a1fSXin Li ldr r7, [sp, #112] @ destination stride 417*a97c2a1fSXin Li ldr r6, [sp, #104] @ src stride 418*a97c2a1fSXin Li ldr r12, [sp, #116] 419*a97c2a1fSXin Li ldr r11, [sp, #120] 420*a97c2a1fSXin Li 421*a97c2a1fSXin Li mov r6, r6, lsl #1 @ x sizeof(word16) 422*a97c2a1fSXin Li add r9, r0, r6, lsl #1 @ 2 rows 423*a97c2a1fSXin Li 424*a97c2a1fSXin Li add r10, r6, r6, lsl #1 @ 3 rows 425*a97c2a1fSXin Li 426*a97c2a1fSXin Li sub r10, r10, #8 @ - 4 cols * sizeof(WORD16) 427*a97c2a1fSXin Li sub r5, r6, #8 @ src_strd - 4 cols * sizeof(WORD16) 428*a97c2a1fSXin Li 429*a97c2a1fSXin Li 430*a97c2a1fSXin Li ldr r14, gai2_impeg2_idct_first_col_q15_addr1 431*a97c2a1fSXin Lifcq15_lbl1: 432*a97c2a1fSXin Li add r14, r14, pc 433*a97c2a1fSXin Li vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data 434*a97c2a1fSXin Li 435*a97c2a1fSXin Li @//Step 2 Load all the input data 436*a97c2a1fSXin Li @//Step 3 Operate first 4 colums at a time 437*a97c2a1fSXin Li 438*a97c2a1fSXin Li and r11, r11, #0xff 439*a97c2a1fSXin Li and r12, r12, #0xff 440*a97c2a1fSXin Li 441*a97c2a1fSXin Li cmp r11, #0xf0 442*a97c2a1fSXin Li bge skip_last4_rows 443*a97c2a1fSXin Li 444*a97c2a1fSXin Li 445*a97c2a1fSXin Li vld1.16 d2, [r0]! 446*a97c2a1fSXin Li vld1.16 d3, [r9]! 447*a97c2a1fSXin Li vld1.16 d4, [r0], r5 448*a97c2a1fSXin Li vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) 449*a97c2a1fSXin Li vld1.16 d5, [r9], r5 450*a97c2a1fSXin Li vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) 451*a97c2a1fSXin Li vld1.16 d6, [r0]! 452*a97c2a1fSXin Li vld1.16 d7, [r9]! 453*a97c2a1fSXin Li vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) 454*a97c2a1fSXin Li vld1.16 d8, [r0], r10 455*a97c2a1fSXin Li vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) 456*a97c2a1fSXin Li vld1.16 d9, [r9], r10 457*a97c2a1fSXin Li vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) 458*a97c2a1fSXin Li vld1.16 d10, [r0]! 
459*a97c2a1fSXin Li vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) 460*a97c2a1fSXin Li vld1.16 d11, [r9]! 461*a97c2a1fSXin Li vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 462*a97c2a1fSXin Li vld1.16 d12, [r0], r5 463*a97c2a1fSXin Li vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 464*a97c2a1fSXin Li vld1.16 d13, [r9], r5 465*a97c2a1fSXin Li vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 466*a97c2a1fSXin Li vld1.16 d14, [r0]! 467*a97c2a1fSXin Li vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 468*a97c2a1fSXin Li vld1.16 d15, [r9]! 469*a97c2a1fSXin Li vmull.s16 q11, d10, d0[0] @// y4 * cos4(part of c0 and c1) 470*a97c2a1fSXin Li vld1.16 d16, [r0], r10 471*a97c2a1fSXin Li vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) 472*a97c2a1fSXin Li vld1.16 d17, [r9], r10 473*a97c2a1fSXin Li 474*a97c2a1fSXin Li @/* This following was activated when alignment is not there */ 475*a97c2a1fSXin Li@// VLD1.16 D2,[r0]! 476*a97c2a1fSXin Li@// VLD1.16 D3,[r2]! 477*a97c2a1fSXin Li@// VLD1.16 D4,[r0]! 478*a97c2a1fSXin Li@// VLD1.16 D5,[r2]! 479*a97c2a1fSXin Li@// VLD1.16 D6,[r0]! 480*a97c2a1fSXin Li@// VLD1.16 D7,[r2]! 481*a97c2a1fSXin Li@// VLD1.16 D8,[r0],r3 482*a97c2a1fSXin Li@// VLD1.16 D9,[r2],r3 483*a97c2a1fSXin Li@// VLD1.16 D10,[r0]! 484*a97c2a1fSXin Li@// VLD1.16 D11,[r2]! 485*a97c2a1fSXin Li@// VLD1.16 D12,[r0]! 486*a97c2a1fSXin Li@// VLD1.16 D13,[r2]! 487*a97c2a1fSXin Li@// VLD1.16 D14,[r0]! 488*a97c2a1fSXin Li@// VLD1.16 D15,[r2]! 
489*a97c2a1fSXin Li@// VLD1.16 D16,[r0],r3 490*a97c2a1fSXin Li@// VLD1.16 D17,[r2],r3 491*a97c2a1fSXin Li 492*a97c2a1fSXin Li 493*a97c2a1fSXin Li 494*a97c2a1fSXin Li 495*a97c2a1fSXin Li vmlal.s16 q12, d14, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 496*a97c2a1fSXin Li vmlsl.s16 q13, d14, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 497*a97c2a1fSXin Li vmlal.s16 q14, d14, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 498*a97c2a1fSXin Li vmlal.s16 q15, d14, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 499*a97c2a1fSXin Li 500*a97c2a1fSXin Li vmlsl.s16 q9, d11, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 501*a97c2a1fSXin Li vmlal.s16 q3, d11, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 502*a97c2a1fSXin Li 503*a97c2a1fSXin Li vadd.s32 q5, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 504*a97c2a1fSXin Li vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 505*a97c2a1fSXin Li 506*a97c2a1fSXin Li vmlal.s16 q12, d15, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) 507*a97c2a1fSXin Li vmlsl.s16 q13, d15, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) 508*a97c2a1fSXin Li vmlal.s16 q14, d15, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) 509*a97c2a1fSXin Li vmlsl.s16 q15, d15, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) 510*a97c2a1fSXin Li 511*a97c2a1fSXin Li vadd.s32 q7, q5, q3 @// a0 = c0 + d0(part of r0,r7) 512*a97c2a1fSXin Li vsub.s32 q5, q5, q3 @// a3 = c0 - d0(part of r3,r4) 513*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 514*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 515*a97c2a1fSXin Li 516*a97c2a1fSXin Li vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) 517*a97c2a1fSXin Li vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) 518*a97c2a1fSXin Li 519*a97c2a1fSXin Li vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) 
520*a97c2a1fSXin Li vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) 521*a97c2a1fSXin Li 522*a97c2a1fSXin Li vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) 523*a97c2a1fSXin Li vsub.s32 q9, q9, q13 @// a1 - b1(part of r6) 524*a97c2a1fSXin Li 525*a97c2a1fSXin Li vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) 526*a97c2a1fSXin Li vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) 527*a97c2a1fSXin Li 528*a97c2a1fSXin Li vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 529*a97c2a1fSXin Li vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 530*a97c2a1fSXin Li vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 531*a97c2a1fSXin Li vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 532*a97c2a1fSXin Li vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 533*a97c2a1fSXin Li vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 534*a97c2a1fSXin Li vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 535*a97c2a1fSXin Li vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 536*a97c2a1fSXin Li 537*a97c2a1fSXin Li 538*a97c2a1fSXin Li b last4_cols 539*a97c2a1fSXin Li 540*a97c2a1fSXin Li 541*a97c2a1fSXin Li 542*a97c2a1fSXin Liskip_last4_rows: 543*a97c2a1fSXin Li 544*a97c2a1fSXin Li 545*a97c2a1fSXin Li ldr r14, gai2_impeg2_idct_first_col_q15_addr2 546*a97c2a1fSXin Lifcq15_lbl2: 547*a97c2a1fSXin Li add r14, r14, pc 548*a97c2a1fSXin Li vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data 549*a97c2a1fSXin Li 550*a97c2a1fSXin Li vld1.16 d2, [r0]! 551*a97c2a1fSXin Li vld1.16 d3, [r9]! 552*a97c2a1fSXin Li vld1.16 d4, [r0], r5 553*a97c2a1fSXin Li vld1.16 d5, [r9], r5 554*a97c2a1fSXin Li vld1.16 d6, [r0]! 555*a97c2a1fSXin Li vld1.16 d7, [r9]! 
556*a97c2a1fSXin Li vld1.16 d8, [r0], r10 557*a97c2a1fSXin Li vld1.16 d9, [r9], r10 558*a97c2a1fSXin Li 559*a97c2a1fSXin Li 560*a97c2a1fSXin Li 561*a97c2a1fSXin Li vmov.s16 q6, #0 562*a97c2a1fSXin Li vmov.s16 q8, #0 563*a97c2a1fSXin Li 564*a97c2a1fSXin Li 565*a97c2a1fSXin Li 566*a97c2a1fSXin Li 567*a97c2a1fSXin Li vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) 568*a97c2a1fSXin Li vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) 569*a97c2a1fSXin Li vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) 570*a97c2a1fSXin Li vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) 571*a97c2a1fSXin Li 572*a97c2a1fSXin Li vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 573*a97c2a1fSXin Li vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 574*a97c2a1fSXin Li vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 575*a97c2a1fSXin Li vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 576*a97c2a1fSXin Li 577*a97c2a1fSXin Li vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) 578*a97c2a1fSXin Li vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) 579*a97c2a1fSXin Li 580*a97c2a1fSXin Li vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) 581*a97c2a1fSXin Li 582*a97c2a1fSXin Li 583*a97c2a1fSXin Li vadd.s32 q7, q10, q3 @// a0 = c0 + d0(part of r0,r7) 584*a97c2a1fSXin Li vsub.s32 q5, q10, q3 @// a3 = c0 - d0(part of r3,r4) 585*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 586*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 587*a97c2a1fSXin Li 588*a97c2a1fSXin Li vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) 589*a97c2a1fSXin Li vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) 590*a97c2a1fSXin Li 591*a97c2a1fSXin Li vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) 592*a97c2a1fSXin Li vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) 593*a97c2a1fSXin Li 594*a97c2a1fSXin Li vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) 595*a97c2a1fSXin Li vsub.s32 q9, q9, q13 @// a1 - b1(part 
of r6) 596*a97c2a1fSXin Li 597*a97c2a1fSXin Li vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) 598*a97c2a1fSXin Li vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) 599*a97c2a1fSXin Li 600*a97c2a1fSXin Li vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 601*a97c2a1fSXin Li vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 602*a97c2a1fSXin Li vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 603*a97c2a1fSXin Li vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 604*a97c2a1fSXin Li vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 605*a97c2a1fSXin Li vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 606*a97c2a1fSXin Li vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 607*a97c2a1fSXin Li vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 608*a97c2a1fSXin Li 609*a97c2a1fSXin Li 610*a97c2a1fSXin Lilast4_cols: 611*a97c2a1fSXin Li 612*a97c2a1fSXin Li 613*a97c2a1fSXin Li cmp r12, #0xf0 614*a97c2a1fSXin Li bge skip_last4cols 615*a97c2a1fSXin Li 616*a97c2a1fSXin Li ldr r14, gai2_impeg2_idct_first_col_q15_addr3 617*a97c2a1fSXin Lifcq15_lbl3: 618*a97c2a1fSXin Li add r14, r14, pc 619*a97c2a1fSXin Li vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data 620*a97c2a1fSXin Li 621*a97c2a1fSXin Li vmull.s16 q12, d8, d0[1] @// y1 * cos1(part of b0) 622*a97c2a1fSXin Li vmull.s16 q13, d8, d0[3] @// y1 * cos3(part of b1) 623*a97c2a1fSXin Li vmull.s16 q14, d8, d1[1] @// y1 * sin3(part of b2) 624*a97c2a1fSXin Li vmull.s16 q15, d8, d1[3] @// y1 * sin1(part of b3) 625*a97c2a1fSXin Li 626*a97c2a1fSXin Li vmlal.s16 q12, d9, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 627*a97c2a1fSXin Li vmlsl.s16 q13, d9, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 628*a97c2a1fSXin Li vmlsl.s16 q14, d9, d0[1] @// y1 * 
sin3 - y3 * cos1(part of b2) 629*a97c2a1fSXin Li vmlsl.s16 q15, d9, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 630*a97c2a1fSXin Li 631*a97c2a1fSXin Li vmull.s16 q9, d5, d1[2] @// y2 * sin2 (Q4 is freed by this time)(part of d1) 632*a97c2a1fSXin Li vmull.s16 q4, d5, d0[2] @// y2 * cos2(part of d0) 633*a97c2a1fSXin Li 634*a97c2a1fSXin Li vmull.s16 q10, d4, d0[0] @// y0 * cos4(part of c0 and c1) 635*a97c2a1fSXin Li vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) 636*a97c2a1fSXin Li 637*a97c2a1fSXin Li vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 638*a97c2a1fSXin Li vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 639*a97c2a1fSXin Li vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 640*a97c2a1fSXin Li vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 641*a97c2a1fSXin Li 642*a97c2a1fSXin Li vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 643*a97c2a1fSXin Li vmlal.s16 q4, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 644*a97c2a1fSXin Li 645*a97c2a1fSXin Li vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 646*a97c2a1fSXin Li vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 647*a97c2a1fSXin Li 648*a97c2a1fSXin Li vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) 649*a97c2a1fSXin Li vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) 650*a97c2a1fSXin Li vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) 651*a97c2a1fSXin Li vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) 652*a97c2a1fSXin Li 653*a97c2a1fSXin Li vadd.s32 q8, q6, q4 @// a0 = c0 + d0(part of e0,e7) 654*a97c2a1fSXin Li vsub.s32 q6, q6, q4 @// a3 = c0 - d0(part of e3,e4) 655*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - 
d1(part of e2,e5) 656*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of e1,e6) 657*a97c2a1fSXin Li 658*a97c2a1fSXin Li vadd.s32 q10, q8, q12 @// a0 + b0(part of e0) 659*a97c2a1fSXin Li vsub.s32 q4, q8, q12 @// a0 - b0(part of e7) 660*a97c2a1fSXin Li 661*a97c2a1fSXin Li vadd.s32 q12, q11, q14 @// a2 + b2(part of e2) 662*a97c2a1fSXin Li vsub.s32 q11, q11, q14 @// a2 - b2(part of e5) 663*a97c2a1fSXin Li 664*a97c2a1fSXin Li vadd.s32 q14, q9, q13 @// a1 + b1(part of e1) 665*a97c2a1fSXin Li vsub.s32 q9, q9, q13 @// a1 - b1(part of e6) 666*a97c2a1fSXin Li 667*a97c2a1fSXin Li vadd.s32 q13, q6, q15 @// a3 + b3(part of e3) 668*a97c2a1fSXin Li vsub.s32 q15, q6, q15 @// a3 - b3(part of r4) 669*a97c2a1fSXin Li 670*a97c2a1fSXin Li vqrshrn.s32 d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 671*a97c2a1fSXin Li vqrshrn.s32 d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 672*a97c2a1fSXin Li vqrshrn.s32 d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 673*a97c2a1fSXin Li vqrshrn.s32 d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 674*a97c2a1fSXin Li vqrshrn.s32 d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 675*a97c2a1fSXin Li vqrshrn.s32 d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 676*a97c2a1fSXin Li vqrshrn.s32 d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 677*a97c2a1fSXin Li vqrshrn.s32 d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 678*a97c2a1fSXin Li b end_skip_last4cols 679*a97c2a1fSXin Li 680*a97c2a1fSXin Li 681*a97c2a1fSXin Li 682*a97c2a1fSXin Liskip_last4cols: 683*a97c2a1fSXin Li 684*a97c2a1fSXin Li 685*a97c2a1fSXin Li 686*a97c2a1fSXin Li ldr r14, gai2_impeg2_idct_first_col_q11_addr1 687*a97c2a1fSXin Lifcq11_lbl1: 688*a97c2a1fSXin Li add r14, r14, pc 689*a97c2a1fSXin Li vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data 690*a97c2a1fSXin 
Li 691*a97c2a1fSXin Li 692*a97c2a1fSXin Li 693*a97c2a1fSXin Li vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing 694*a97c2a1fSXin Li 695*a97c2a1fSXin Li vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing 696*a97c2a1fSXin Li 697*a97c2a1fSXin Li 698*a97c2a1fSXin Li vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... 699*a97c2a1fSXin Li vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... 700*a97c2a1fSXin Li 701*a97c2a1fSXin Li vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... 702*a97c2a1fSXin Li vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... 703*a97c2a1fSXin Li 704*a97c2a1fSXin Li 705*a97c2a1fSXin Li vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) 706*a97c2a1fSXin Li vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) 707*a97c2a1fSXin Li vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) 708*a97c2a1fSXin Li vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) 709*a97c2a1fSXin Li 710*a97c2a1fSXin Li vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 711*a97c2a1fSXin Li vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 712*a97c2a1fSXin Li vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 713*a97c2a1fSXin Li vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 714*a97c2a1fSXin Li 715*a97c2a1fSXin Li vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) 716*a97c2a1fSXin Li@ VMULL.S16 Q11,D4,D0[0] ;// y4 * cos4(part of c0 and c1) 717*a97c2a1fSXin Li 718*a97c2a1fSXin Li vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) 719*a97c2a1fSXin Li vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) 720*a97c2a1fSXin Li 721*a97c2a1fSXin Li 722*a97c2a1fSXin Li 723*a97c2a1fSXin Li 724*a97c2a1fSXin Li vsub.s32 q11, q10, q3 @// a3 = c0 - d0(part of r3,r4) 725*a97c2a1fSXin Li vadd.s32 q2, q10, q3 @// a0 = c0 + d0(part of r0,r7) 726*a97c2a1fSXin Li 727*a97c2a1fSXin Li 728*a97c2a1fSXin Li vadd.s32 q1, q2, q12 
729*a97c2a1fSXin Li 730*a97c2a1fSXin Li vsub.s32 q3, q2, q12 731*a97c2a1fSXin Li 732*a97c2a1fSXin Li vadd.s32 q4, q11, q15 733*a97c2a1fSXin Li 734*a97c2a1fSXin Li vsub.s32 q12, q11, q15 735*a97c2a1fSXin Li 736*a97c2a1fSXin Li vqrshrn.s32 d5, q4, #idct_stg2_shift 737*a97c2a1fSXin Li vqrshrn.s32 d2, q1, #idct_stg2_shift 738*a97c2a1fSXin Li vqrshrn.s32 d9, q3, #idct_stg2_shift 739*a97c2a1fSXin Li vqrshrn.s32 d6, q12, #idct_stg2_shift 740*a97c2a1fSXin Li 741*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 742*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 743*a97c2a1fSXin Li 744*a97c2a1fSXin Li 745*a97c2a1fSXin Li vadd.s32 q15, q11, q14 746*a97c2a1fSXin Li 747*a97c2a1fSXin Li vsub.s32 q12, q11, q14 748*a97c2a1fSXin Li 749*a97c2a1fSXin Li vadd.s32 q14, q9, q13 750*a97c2a1fSXin Li 751*a97c2a1fSXin Li vsub.s32 q11, q9, q13 752*a97c2a1fSXin Li vqrshrn.s32 d4, q15, #idct_stg2_shift 753*a97c2a1fSXin Li vqrshrn.s32 d7, q12, #idct_stg2_shift 754*a97c2a1fSXin Li vqrshrn.s32 d3, q14, #idct_stg2_shift 755*a97c2a1fSXin Li vqrshrn.s32 d8, q11, #idct_stg2_shift 756*a97c2a1fSXin Li 757*a97c2a1fSXin Li 758*a97c2a1fSXin Li 759*a97c2a1fSXin Li 760*a97c2a1fSXin Li 761*a97c2a1fSXin Li 762*a97c2a1fSXin Li 763*a97c2a1fSXin Li 764*a97c2a1fSXin Li 765*a97c2a1fSXin Li 766*a97c2a1fSXin Li vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) 767*a97c2a1fSXin Li 768*a97c2a1fSXin Li vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) 769*a97c2a1fSXin Li vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) 770*a97c2a1fSXin Li vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) 771*a97c2a1fSXin Li 772*a97c2a1fSXin Li vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 773*a97c2a1fSXin Li vtrn.16 d2, d3 774*a97c2a1fSXin Li vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 775*a97c2a1fSXin Li vtrn.16 d4, d5 776*a97c2a1fSXin Li vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 777*a97c2a1fSXin Li vtrn.16 d6, d7 
778*a97c2a1fSXin Li vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 779*a97c2a1fSXin Li vtrn.16 d8, d9 780*a97c2a1fSXin Li vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) 781*a97c2a1fSXin Li vtrn.32 d2, d4 782*a97c2a1fSXin Li 783*a97c2a1fSXin Li vtrn.32 d3, d5 784*a97c2a1fSXin Li vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) 785*a97c2a1fSXin Li vtrn.32 d6, d8 786*a97c2a1fSXin Li vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) 787*a97c2a1fSXin Li vtrn.32 d7, d9 788*a97c2a1fSXin Li 789*a97c2a1fSXin Li 790*a97c2a1fSXin Li add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data 791*a97c2a1fSXin Li 792*a97c2a1fSXin Li 793*a97c2a1fSXin Li add r5, r8, r8, lsl #1 @ 794*a97c2a1fSXin Li 795*a97c2a1fSXin Li 796*a97c2a1fSXin Li add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data 797*a97c2a1fSXin Li 798*a97c2a1fSXin Li 799*a97c2a1fSXin Li add r10, r7, r7, lsl #1 @ 800*a97c2a1fSXin Li 801*a97c2a1fSXin Li 802*a97c2a1fSXin Li vswp d3, d6 803*a97c2a1fSXin Li 804*a97c2a1fSXin Li 805*a97c2a1fSXin Li vswp d5, d8 806*a97c2a1fSXin Li 807*a97c2a1fSXin Li 808*a97c2a1fSXin Li vsub.s32 q11, q10, q7 @// a3 = c0 - d0(part of r3,r4) 809*a97c2a1fSXin Li vadd.s32 q6, q10, q7 @// a0 = c0 + d0(part of r0,r7) 810*a97c2a1fSXin Li 811*a97c2a1fSXin Li 812*a97c2a1fSXin Li vadd.s32 q0, q6, q12 813*a97c2a1fSXin Li 814*a97c2a1fSXin Li 815*a97c2a1fSXin Li vsub.s32 q12, q6, q12 816*a97c2a1fSXin Li 817*a97c2a1fSXin Li 818*a97c2a1fSXin Li vadd.s32 q6, q11, q15 819*a97c2a1fSXin Li 820*a97c2a1fSXin Li 821*a97c2a1fSXin Li vsub.s32 q7, q11, q15 822*a97c2a1fSXin Li 823*a97c2a1fSXin Li vqrshrn.s32 d10, q0, #idct_stg2_shift 824*a97c2a1fSXin Li vqrshrn.s32 d17, q12, #idct_stg2_shift 825*a97c2a1fSXin Li vqrshrn.s32 d13, q6, #idct_stg2_shift 826*a97c2a1fSXin Li vqrshrn.s32 d14, q7, #idct_stg2_shift 827*a97c2a1fSXin Li 828*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 829*a97c2a1fSXin Li vadd.s32 
q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 830*a97c2a1fSXin Li 831*a97c2a1fSXin Li 832*a97c2a1fSXin Li vadd.s32 q0, q11, q14 833*a97c2a1fSXin Li 834*a97c2a1fSXin Li 835*a97c2a1fSXin Li vsub.s32 q12, q11, q14 836*a97c2a1fSXin Li 837*a97c2a1fSXin Li 838*a97c2a1fSXin Li vadd.s32 q14, q9, q13 839*a97c2a1fSXin Li 840*a97c2a1fSXin Li 841*a97c2a1fSXin Li vsub.s32 q13, q9, q13 842*a97c2a1fSXin Li vld1.8 d18, [r2], r8 843*a97c2a1fSXin Li 844*a97c2a1fSXin Li vqrshrn.s32 d12, q0, #idct_stg2_shift 845*a97c2a1fSXin Li vld1.8 d20, [r2], r5 846*a97c2a1fSXin Li 847*a97c2a1fSXin Li 848*a97c2a1fSXin Li vqrshrn.s32 d15, q12, #idct_stg2_shift 849*a97c2a1fSXin Li vld1.8 d19, [r2], r8 850*a97c2a1fSXin Li 851*a97c2a1fSXin Li 852*a97c2a1fSXin Li 853*a97c2a1fSXin Li 854*a97c2a1fSXin Li vqrshrn.s32 d11, q14, #idct_stg2_shift 855*a97c2a1fSXin Li vld1.8 d22, [r4], r8 856*a97c2a1fSXin Li 857*a97c2a1fSXin Li 858*a97c2a1fSXin Li 859*a97c2a1fSXin Li 860*a97c2a1fSXin Li vqrshrn.s32 d16, q13, #idct_stg2_shift 861*a97c2a1fSXin Li vld1.8 d21, [r2], r5 862*a97c2a1fSXin Li 863*a97c2a1fSXin Li 864*a97c2a1fSXin Li b pred_buff_addition 865*a97c2a1fSXin Liend_skip_last4cols: 866*a97c2a1fSXin Li 867*a97c2a1fSXin Li ldr r14, gai2_impeg2_idct_first_col_q11_addr2 868*a97c2a1fSXin Lifcq11_lbl2: 869*a97c2a1fSXin Li add r14, r14, pc 870*a97c2a1fSXin Li vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data 871*a97c2a1fSXin Li 872*a97c2a1fSXin Li 873*a97c2a1fSXin Li@/* Now the Idct of columns is done, transpose so that row idct done efficiently(step5) */ 874*a97c2a1fSXin Li vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing 875*a97c2a1fSXin Li vtrn.16 q2, q4 @//[r3,r1],[r2,r0] second qudrant transposing 876*a97c2a1fSXin Li vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing 877*a97c2a1fSXin Li vtrn.16 q6, q8 @//[r7,r5],[r6,r4] fourth qudrant transposing 878*a97c2a1fSXin Li 879*a97c2a1fSXin Li vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... 
880*a97c2a1fSXin Li vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... 881*a97c2a1fSXin Li vtrn.32 d4, d5 @//r0,r1,r2,r3 second qudrant transposing continued..... 882*a97c2a1fSXin Li vtrn.32 d8, d9 @//r0,r1,r2,r3 second qudrant transposing continued..... 883*a97c2a1fSXin Li vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... 884*a97c2a1fSXin Li vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... 885*a97c2a1fSXin Li vtrn.32 d12, d13 @//r4,r5,r6,r7 fourth qudrant transposing continued..... 886*a97c2a1fSXin Li vtrn.32 d16, d17 @//r4,r5,r6,r7 fourth qudrant transposing continued..... 887*a97c2a1fSXin Li 888*a97c2a1fSXin Li @//step6 Operate on first four rows and find their idct 889*a97c2a1fSXin Li @//Register Usage Reference - storing and IDCT of rows 890*a97c2a1fSXin Li@// Cosine Constants - D0 891*a97c2a1fSXin Li@// Sine Constants - D1 892*a97c2a1fSXin Li@// Element 0 First four - D2 - y0 893*a97c2a1fSXin Li@// Element 1 First four - D6 - y1 894*a97c2a1fSXin Li@// Element 2 First four - D3 - y2 895*a97c2a1fSXin Li@// Element 3 First four - D7 - y3 896*a97c2a1fSXin Li@// Element 4 First four - D4 - y4 897*a97c2a1fSXin Li@// Element 5 First four - D8 - y5 898*a97c2a1fSXin Li@// Element 6 First four - D5 - y6 899*a97c2a1fSXin Li@// Element 7 First four - D9 - y7 900*a97c2a1fSXin Li@// Element 0 Second four - D10 - y0 901*a97c2a1fSXin Li@// Element 1 Second four - D14 - y1 902*a97c2a1fSXin Li@// Element 2 Second four - D11 - y2 903*a97c2a1fSXin Li@// Element 3 Second four - D15 - y3 904*a97c2a1fSXin Li@// Element 4 Second four - D12 - y4 905*a97c2a1fSXin Li@// Element 5 Second four - D16 - y5 906*a97c2a1fSXin Li@// Element 6 Second four - D13 - y6 907*a97c2a1fSXin Li@// Element 7 Second four - D17 - y7 908*a97c2a1fSXin Li 909*a97c2a1fSXin Li @// Map between first kernel code seq and current 910*a97c2a1fSXin Li@// D2 -> D2 911*a97c2a1fSXin Li@// D6 -> D6 912*a97c2a1fSXin Li@// D3 -> D3 913*a97c2a1fSXin Li@// D7 
-> D7 914*a97c2a1fSXin Li@// D10 -> D4 915*a97c2a1fSXin Li@// D14 -> D8 916*a97c2a1fSXin Li@// D11 -> D5 917*a97c2a1fSXin Li@// D15 -> D9 918*a97c2a1fSXin Li@// Q3 -> Q3 919*a97c2a1fSXin Li@// Q5 -> Q2 920*a97c2a1fSXin Li@// Q7 -> Q4 921*a97c2a1fSXin Li 922*a97c2a1fSXin Li vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) 923*a97c2a1fSXin Li vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) 924*a97c2a1fSXin Li vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) 925*a97c2a1fSXin Li vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) 926*a97c2a1fSXin Li 927*a97c2a1fSXin Li vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 928*a97c2a1fSXin Li vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 929*a97c2a1fSXin Li vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 930*a97c2a1fSXin Li vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 931*a97c2a1fSXin Li 932*a97c2a1fSXin Li vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) 933*a97c2a1fSXin Li vmull.s16 q11, d4, d0[0] @// y4 * cos4(part of c0 and c1) 934*a97c2a1fSXin Li 935*a97c2a1fSXin Li vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) 936*a97c2a1fSXin Li vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) 937*a97c2a1fSXin Li 938*a97c2a1fSXin Li 939*a97c2a1fSXin Li vmlal.s16 q12, d8, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 940*a97c2a1fSXin Li vmlsl.s16 q13, d8, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 941*a97c2a1fSXin Li vmlal.s16 q14, d8, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 942*a97c2a1fSXin Li vmlal.s16 q15, d8, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 943*a97c2a1fSXin Li 944*a97c2a1fSXin Li vmlsl.s16 q9, d5, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 945*a97c2a1fSXin Li vmlal.s16 q3, d5, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 946*a97c2a1fSXin Li 947*a97c2a1fSXin Li vadd.s32 q1, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 
948*a97c2a1fSXin Li vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 949*a97c2a1fSXin Li 950*a97c2a1fSXin Li vmlal.s16 q12, d9, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) 951*a97c2a1fSXin Li vmlsl.s16 q13, d9, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) 952*a97c2a1fSXin Li vmlal.s16 q14, d9, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) 953*a97c2a1fSXin Li vmlsl.s16 q15, d9, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) 954*a97c2a1fSXin Li 955*a97c2a1fSXin Li vsub.s32 q11, q1, q3 @// a3 = c0 - d0(part of r3,r4) 956*a97c2a1fSXin Li vadd.s32 q2, q1, q3 @// a0 = c0 + d0(part of r0,r7) 957*a97c2a1fSXin Li 958*a97c2a1fSXin Li 959*a97c2a1fSXin Li vadd.s32 q1, q2, q12 960*a97c2a1fSXin Li 961*a97c2a1fSXin Li vsub.s32 q3, q2, q12 962*a97c2a1fSXin Li 963*a97c2a1fSXin Li vadd.s32 q4, q11, q15 964*a97c2a1fSXin Li 965*a97c2a1fSXin Li vsub.s32 q12, q11, q15 966*a97c2a1fSXin Li 967*a97c2a1fSXin Li vqrshrn.s32 d5, q4, #idct_stg2_shift 968*a97c2a1fSXin Li vqrshrn.s32 d2, q1, #idct_stg2_shift 969*a97c2a1fSXin Li vqrshrn.s32 d9, q3, #idct_stg2_shift 970*a97c2a1fSXin Li vqrshrn.s32 d6, q12, #idct_stg2_shift 971*a97c2a1fSXin Li 972*a97c2a1fSXin Li vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 973*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 974*a97c2a1fSXin Li 975*a97c2a1fSXin Li 976*a97c2a1fSXin Li vadd.s32 q15, q11, q14 977*a97c2a1fSXin Li 978*a97c2a1fSXin Li vsub.s32 q12, q11, q14 979*a97c2a1fSXin Li 980*a97c2a1fSXin Li vadd.s32 q14, q9, q13 981*a97c2a1fSXin Li 982*a97c2a1fSXin Li vsub.s32 q11, q9, q13 983*a97c2a1fSXin Li vqrshrn.s32 d4, q15, #idct_stg2_shift 984*a97c2a1fSXin Li vqrshrn.s32 d7, q12, #idct_stg2_shift 985*a97c2a1fSXin Li vqrshrn.s32 d3, q14, #idct_stg2_shift 986*a97c2a1fSXin Li vqrshrn.s32 d8, q11, #idct_stg2_shift 987*a97c2a1fSXin Li 988*a97c2a1fSXin Li 989*a97c2a1fSXin Li 990*a97c2a1fSXin 
Li 991*a97c2a1fSXin Li 992*a97c2a1fSXin Li 993*a97c2a1fSXin Li 994*a97c2a1fSXin Li 995*a97c2a1fSXin Li 996*a97c2a1fSXin Li 997*a97c2a1fSXin Li vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) 998*a97c2a1fSXin Li 999*a97c2a1fSXin Li vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) 1000*a97c2a1fSXin Li vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) 1001*a97c2a1fSXin Li vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) 1002*a97c2a1fSXin Li 1003*a97c2a1fSXin Li vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) 1004*a97c2a1fSXin Li vtrn.16 d2, d3 1005*a97c2a1fSXin Li vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) 1006*a97c2a1fSXin Li vtrn.16 d4, d5 1007*a97c2a1fSXin Li vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) 1008*a97c2a1fSXin Li vtrn.16 d6, d7 1009*a97c2a1fSXin Li vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) 1010*a97c2a1fSXin Li vtrn.16 d8, d9 1011*a97c2a1fSXin Li vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) 1012*a97c2a1fSXin Li vtrn.32 d2, d4 1013*a97c2a1fSXin Li vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) 1014*a97c2a1fSXin Li vtrn.32 d3, d5 1015*a97c2a1fSXin Li vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) 1016*a97c2a1fSXin Li vtrn.32 d6, d8 1017*a97c2a1fSXin Li vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) 1018*a97c2a1fSXin Li vtrn.32 d7, d9 1019*a97c2a1fSXin Li vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 1020*a97c2a1fSXin Li 1021*a97c2a1fSXin Li add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data 1022*a97c2a1fSXin Li vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 1023*a97c2a1fSXin Li 1024*a97c2a1fSXin Li add r5, r8, r8, lsl #1 @ 1025*a97c2a1fSXin Li vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 1026*a97c2a1fSXin Li 1027*a97c2a1fSXin Li add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data 
1028*a97c2a1fSXin Li vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 1029*a97c2a1fSXin Li 1030*a97c2a1fSXin Li add r10, r7, r7, lsl #1 @ 1031*a97c2a1fSXin Li vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 1032*a97c2a1fSXin Li 1033*a97c2a1fSXin Li 1034*a97c2a1fSXin Li vmlal.s16 q7, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 1035*a97c2a1fSXin Li 1036*a97c2a1fSXin Li vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 1037*a97c2a1fSXin Li vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 1038*a97c2a1fSXin Li 1039*a97c2a1fSXin Li vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) 1040*a97c2a1fSXin Li vswp d3, d6 1041*a97c2a1fSXin Li vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) 1042*a97c2a1fSXin Li 1043*a97c2a1fSXin Li vswp d5, d8 1044*a97c2a1fSXin Li vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) 1045*a97c2a1fSXin Li vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) 1046*a97c2a1fSXin Li 1047*a97c2a1fSXin Li vsub.s32 q11, q6, q7 @// a3 = c0 - d0(part of r3,r4) 1048*a97c2a1fSXin Li vadd.s32 q6, q6, q7 @// a0 = c0 + d0(part of r0,r7) 1049*a97c2a1fSXin Li 1050*a97c2a1fSXin Li 1051*a97c2a1fSXin Li vadd.s32 q0, q6, q12 1052*a97c2a1fSXin Li 1053*a97c2a1fSXin Li 1054*a97c2a1fSXin Li vsub.s32 q12, q6, q12 1055*a97c2a1fSXin Li 1056*a97c2a1fSXin Li 1057*a97c2a1fSXin Li vadd.s32 q6, q11, q15 1058*a97c2a1fSXin Li 1059*a97c2a1fSXin Li 1060*a97c2a1fSXin Li vsub.s32 q7, q11, q15 1061*a97c2a1fSXin Li 1062*a97c2a1fSXin Li vqrshrn.s32 d10, q0, #idct_stg2_shift 1063*a97c2a1fSXin Li vqrshrn.s32 d17, q12, #idct_stg2_shift 1064*a97c2a1fSXin Li vqrshrn.s32 d13, q6, #idct_stg2_shift 1065*a97c2a1fSXin Li vqrshrn.s32 d14, q7, #idct_stg2_shift 1066*a97c2a1fSXin Li 1067*a97c2a1fSXin Li vsub.s32 
q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) 1068*a97c2a1fSXin Li vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) 1069*a97c2a1fSXin Li 1070*a97c2a1fSXin Li 1071*a97c2a1fSXin Li vadd.s32 q0, q11, q14 1072*a97c2a1fSXin Li 1073*a97c2a1fSXin Li 1074*a97c2a1fSXin Li vsub.s32 q12, q11, q14 1075*a97c2a1fSXin Li 1076*a97c2a1fSXin Li 1077*a97c2a1fSXin Li vadd.s32 q14, q9, q13 1078*a97c2a1fSXin Li 1079*a97c2a1fSXin Li 1080*a97c2a1fSXin Li vsub.s32 q13, q9, q13 1081*a97c2a1fSXin Li vld1.8 d18, [r2], r8 1082*a97c2a1fSXin Li 1083*a97c2a1fSXin Li vqrshrn.s32 d12, q0, #idct_stg2_shift 1084*a97c2a1fSXin Li vld1.8 d20, [r2], r5 1085*a97c2a1fSXin Li 1086*a97c2a1fSXin Li 1087*a97c2a1fSXin Li vqrshrn.s32 d15, q12, #idct_stg2_shift 1088*a97c2a1fSXin Li vld1.8 d19, [r2], r8 1089*a97c2a1fSXin Li 1090*a97c2a1fSXin Li 1091*a97c2a1fSXin Li 1092*a97c2a1fSXin Li 1093*a97c2a1fSXin Li vqrshrn.s32 d11, q14, #idct_stg2_shift 1094*a97c2a1fSXin Li vld1.8 d22, [r4], r8 1095*a97c2a1fSXin Li 1096*a97c2a1fSXin Li 1097*a97c2a1fSXin Li 1098*a97c2a1fSXin Li 1099*a97c2a1fSXin Li vqrshrn.s32 d16, q13, #idct_stg2_shift 1100*a97c2a1fSXin Li vld1.8 d21, [r2], r5 1101*a97c2a1fSXin Li 1102*a97c2a1fSXin Li 1103*a97c2a1fSXin Li 1104*a97c2a1fSXin Li 1105*a97c2a1fSXin Lipred_buff_addition: 1106*a97c2a1fSXin Li 1107*a97c2a1fSXin Li 1108*a97c2a1fSXin Li vtrn.16 d10, d11 1109*a97c2a1fSXin Li vld1.8 d24, [r4], r5 1110*a97c2a1fSXin Li 1111*a97c2a1fSXin Li vtrn.16 d12, d13 1112*a97c2a1fSXin Li vld1.8 d23, [r4], r8 1113*a97c2a1fSXin Li 1114*a97c2a1fSXin Li vaddw.u8 q1, q1, d18 1115*a97c2a1fSXin Li vld1.8 d25, [r4], r5 1116*a97c2a1fSXin Li 1117*a97c2a1fSXin Li vtrn.16 d14, d15 1118*a97c2a1fSXin Li vaddw.u8 q2, q2, d22 1119*a97c2a1fSXin Li 1120*a97c2a1fSXin Li vtrn.16 d16, d17 1121*a97c2a1fSXin Li vaddw.u8 q3, q3, d20 1122*a97c2a1fSXin Li 1123*a97c2a1fSXin Li vtrn.32 d10, d12 1124*a97c2a1fSXin Li vaddw.u8 q4, q4, d24 1125*a97c2a1fSXin Li 1126*a97c2a1fSXin Li vtrn.32 d11, d13 1127*a97c2a1fSXin Li vtrn.32 d14, d16 
1128*a97c2a1fSXin Li vtrn.32 d15, d17 1129*a97c2a1fSXin Li 1130*a97c2a1fSXin Li vswp d11, d14 1131*a97c2a1fSXin Li vswp d13, d16 1132*a97c2a1fSXin Li 1133*a97c2a1fSXin Li@ Row values stored in the q register. 1134*a97c2a1fSXin Li 1135*a97c2a1fSXin Li@Q1 :r0 1136*a97c2a1fSXin Li@Q3: r1 1137*a97c2a1fSXin Li@Q2: r2 1138*a97c2a1fSXin Li@Q4: r3 1139*a97c2a1fSXin Li@Q5: r4 1140*a97c2a1fSXin Li@Q7: r5 1141*a97c2a1fSXin Li@Q6: r6 1142*a97c2a1fSXin Li@Q8: r7 1143*a97c2a1fSXin Li 1144*a97c2a1fSXin Li 1145*a97c2a1fSXin Li 1146*a97c2a1fSXin Li@/// Adding the prediction buffer 1147*a97c2a1fSXin Li 1148*a97c2a1fSXin Li 1149*a97c2a1fSXin Li 1150*a97c2a1fSXin Li 1151*a97c2a1fSXin Li 1152*a97c2a1fSXin Li 1153*a97c2a1fSXin Li 1154*a97c2a1fSXin Li 1155*a97c2a1fSXin Li 1156*a97c2a1fSXin Li @ Load prediction data 1157*a97c2a1fSXin Li 1158*a97c2a1fSXin Li 1159*a97c2a1fSXin Li 1160*a97c2a1fSXin Li 1161*a97c2a1fSXin Li 1162*a97c2a1fSXin Li @Adding recon with prediction 1163*a97c2a1fSXin Li 1164*a97c2a1fSXin Li 1165*a97c2a1fSXin Li 1166*a97c2a1fSXin Li 1167*a97c2a1fSXin Li 1168*a97c2a1fSXin Li vaddw.u8 q5, q5, d19 1169*a97c2a1fSXin Li vqmovun.s16 d2, q1 1170*a97c2a1fSXin Li vaddw.u8 q7, q7, d21 1171*a97c2a1fSXin Li vqmovun.s16 d4, q2 1172*a97c2a1fSXin Li vaddw.u8 q6, q6, d23 1173*a97c2a1fSXin Li vqmovun.s16 d6, q3 1174*a97c2a1fSXin Li vaddw.u8 q8, q8, d25 1175*a97c2a1fSXin Li vqmovun.s16 d8, q4 1176*a97c2a1fSXin Li 1177*a97c2a1fSXin Li 1178*a97c2a1fSXin Li 1179*a97c2a1fSXin Li 1180*a97c2a1fSXin Li 1181*a97c2a1fSXin Li 1182*a97c2a1fSXin Li 1183*a97c2a1fSXin Li vst1.8 {d2}, [r3], r7 1184*a97c2a1fSXin Li vqmovun.s16 d10, q5 1185*a97c2a1fSXin Li vst1.8 {d6}, [r3], r10 1186*a97c2a1fSXin Li vqmovun.s16 d14, q7 1187*a97c2a1fSXin Li vst1.8 {d4}, [r0], r7 1188*a97c2a1fSXin Li vqmovun.s16 d12, q6 1189*a97c2a1fSXin Li vst1.8 {d8}, [r0], r10 1190*a97c2a1fSXin Li vqmovun.s16 d16, q8 1191*a97c2a1fSXin Li 1192*a97c2a1fSXin Li 1193*a97c2a1fSXin Li 1194*a97c2a1fSXin Li 1195*a97c2a1fSXin Li 
1196*a97c2a1fSXin Li 1197*a97c2a1fSXin Li 1198*a97c2a1fSXin Li vst1.8 {d10}, [r3], r7 1199*a97c2a1fSXin Li vst1.8 {d14}, [r3], r10 1200*a97c2a1fSXin Li vst1.8 {d12}, [r0], r7 1201*a97c2a1fSXin Li vst1.8 {d16}, [r0], r10 1202*a97c2a1fSXin Li 1203*a97c2a1fSXin Li 1204*a97c2a1fSXin Li 1205*a97c2a1fSXin Li 1206*a97c2a1fSXin Li 1207*a97c2a1fSXin Li vpop {d8-d15} 1208*a97c2a1fSXin Li ldmfd sp!, {r4-r12, pc} 1209*a97c2a1fSXin Li 1210*a97c2a1fSXin Li 1211*a97c2a1fSXin Li 1212