//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name            : impeg2_inter_pred.s
////
//// Description          : This file has motion compensation related
////                        interpolation functions on Neon + CortexA-8 platform
////
//// Reference Document   :
////
//// Revision History     :
////      Date            Author                  Detail Description
////   ------------    ----------------    ----------------------------------
////   18 jun 2010     S Hamsalekha              Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
//              PRESERVE8
.text
.include "impeg2_neon_macros.s"

///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/


///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
////
//// -------------------------- NONE --------------------------------------------


///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_copy_mb_av8()
////
//// Detail Description :   Copies one macroblock from src to dst, plane by
////                        plane:
////                          - 16 rows of 16 bytes through the pointers stored
////                            at offset 0 of the src/dst descriptors (y)
////                          - 8 rows of 8 bytes through the pointers stored
////                            at offset 8 (u)
////                          - 8 rows of 8 bytes through the pointers stored
////                            at offset 16 (v)
////                        The u/v planes are stepped with half the row
////                        increment used for y.
////
//// Inputs             :   x0 - pointer to src descriptor (three 8-byte plane
////                             pointers at offsets 0, 8, 16)
////                        x1 - pointer to dst descriptor (same layout)
////                        x2 - source row increment in bytes for y
////                        x3 - destination row increment in bytes for y
////
//// Registers Used     :   x2, x3 (modified), x4, x5, v0, v1
////
//// Stack Usage        :   64 bytes according to the original header; the
////                        actual amount is whatever push_v_regs (defined in
////                        impeg2_neon_macros.s) saves.
////
//// Outputs            :   Destination planes are overwritten.
////
//// Return Data        :   None
////
//// Programming Note   :   NOTE(review): x2/x3 are consumed as full 64-bit
////                        post-index increments. If the C prototype passes
////                        them as 32-bit values, confirm that the upper
////                        halves are guaranteed to be clean.
////-----------------------------------------------------------------------------
//*/

.global impeg2_copy_mb_av8

impeg2_copy_mb_av8:

    push_v_regs

    // ---- y plane: 16 rows x 16 bytes -------------------------------------
    ldr             x4, [x0]                    // x4 = src->y
    ldr             x5, [x1]                    // x5 = dst->y

    .rept 16                                    // one expansion per row
    ld1             {v0.8b, v1.8b}, [x4], x2    // fetch 16 bytes, src += src stride
    st1             {v0.8b, v1.8b}, [x5], x3    // write 16 bytes, dst += dst stride
    .endr

    // ---- chroma planes use half of the y row increment -------------------
    lsr             x2, x2, #1                  // src stride >>= 1
    lsr             x3, x3, #1                  // dst stride >>= 1

    // ---- u plane: 8 rows x 8 bytes ---------------------------------------
    ldr             x4, [x0, #8]                // x4 = src->u
    ldr             x5, [x1, #8]                // x5 = dst->u

    .rept 8                                     // one expansion per row
    ld1             {v0.8b}, [x4], x2           // fetch 8 bytes, src += src stride
    st1             {v0.8b}, [x5], x3           // write 8 bytes, dst += dst stride
    .endr

    // ---- v plane: 8 rows x 8 bytes ---------------------------------------
    ldr             x4, [x0, #16]               // x4 = src->v
    ldr             x5, [x1, #16]               // x5 = dst->v

    .rept 8                                     // one expansion per row
    ld1             {v0.8b}, [x4], x2           // fetch 8 bytes, src += src stride
    st1             {v0.8b}, [x5], x3           // write 8 bytes, dst += dst stride
    .endr

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description :   This function
////                        pastes the reference block in the
////                        current frame buffer.This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution.
////
//// Inputs             :   x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used     :   x14, v0-v9
////
//// Stack Usage        :   64 bytes
////
//// Outputs            :   The Motion Compensated Block
////
//// Return Data        :   None
////
//// Programming Note   :   <program limitation>
////-----------------------------------------------------------------------------
//*/

// Implementation notes
//   * Vertical half-pel only: nine reference rows r1..r9 of 8 bytes are read
//     and each output row is the rounded average of two vertically adjacent
//     rows, out[k] = (r[k] + r[k+1] + 1) >> 1 (URHADD).
//   * Two row pointers are walked in parallel, each advancing by 2 * width:
//     x1 visits the odd rows, x14 the even rows. The same scheme is used for
//     the stores (x0 odd rows, x14 even rows).
//   * Register/row map after the loads:
//       v0=r1  v2=r2  v4=r3  v6=r4  v1=r5  v3=r6  v5=r7  v7=r8  v8=r9
//   * Some averages use the .16b arrangement although only the low 8 lanes
//     carry pixels; the upper lanes are zero because every source register
//     was written by an 8-byte LD1.
//   * NOTE(review): x2/x3 are used as 64-bit values; confirm the caller passes
//     them with clean upper halves.

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

    push_v_regs

    add             x14, x1, x2                 // x14 -> r2 (even reference rows)
    lsl             x2, x2, #1                  // step two rows per access

    // ---- load r1..r9, averaging as soon as both inputs are present --------
    ld1             {v0.8b}, [x1], x2           // v0 = r1
    ld1             {v2.8b}, [x14], x2          // v2 = r2
    ld1             {v4.8b}, [x1], x2           // v4 = r3
    ld1             {v6.8b}, [x14], x2          // v6 = r4
    ld1             {v1.8b}, [x1], x2           // v1 = r5
    ld1             {v3.8b}, [x14], x2          // v3 = r6
    urhadd          v9.8b, v1.8b, v6.8b         // v9 = out4 = avg(r5, r4); done before v1 is overwritten
    ld1             {v5.8b}, [x1], x2           // v5 = r7
    urhadd          v0.16b, v0.16b, v2.16b      // v0 = out1 = avg(r1, r2)
    urhadd          v1.16b, v1.16b, v3.16b      // v1 = out5 = avg(r5, r6)
    ld1             {v7.8b}, [x14], x2          // v7 = r8
    urhadd          v2.16b, v2.16b, v4.16b      // v2 = out2 = avg(r2, r3)
    urhadd          v3.16b, v3.16b, v5.16b      // v3 = out6 = avg(r6, r7)
    ld1             {v8.8b}, [x1], x2           // v8 = r9
    urhadd          v4.16b, v4.16b, v6.16b      // v4 = out3 = avg(r3, r4)
    urhadd          v5.16b, v5.16b, v7.16b      // v5 = out7 = avg(r7, r8)

    add             x14, x0, x3                 // x14 -> output row 2 (even output rows)
    lsl             x3, x3, #1                  // step two rows per access

    // ---- store the eight output rows --------------------------------------
    st1             {v2.8b}, [x14], x3          // out2
    urhadd          v7.8b, v7.8b, v8.8b         // v7 = out8 = avg(r8, r9)
    st1             {v0.8b}, [x0], x3           // out1
    st1             {v9.8b}, [x14], x3          // out4
    st1             {v4.8b}, [x0], x3           // out3
    st1             {v3.8b}, [x14], x3          // out6
    st1             {v1.8b}, [x0], x3           // out5
    st1             {v7.8b}, [x14], x3          // out8
    st1             {v5.8b}, [x0], x3           // out7

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description :   This function pastes the reference block in the
////                        current frame buffer.This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution and VopRoundingType is 0 ..
////
//// Inputs             :   x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used     :   x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////
//// Stack Usage        :   64 bytes
////
//// Outputs            :   The Motion Compensated Block
////
//// Return Data        :   None
////
//// Programming Note   :   <program limitation>
////-----------------------------------------------------------------------------
//*/

// Implementation notes
//   * Horizontal half-pel only: out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
//     for an 8x8 block (URHADD is the rounding halving add).
//   * Each reference row is fetched as two 8-byte registers (16 bytes read);
//     EXT #1 over that pair yields bytes 1..8, i.e. the row shifted left by
//     one pixel. Only bytes 0..8 of each row influence the result.
//   * Rows 1-4 are walked through x1/x0, rows 5-8 through x14/x12
//     (base + 4 * width).
//   * The .16b averages operate on registers whose upper 8 lanes are zero
//     (all inputs come from 8-byte LD1/EXT), so only the low 8 lanes matter.
//   * NOTE(review): 16 bytes are read per row although 9 are needed; confirm
//     the reference buffer is padded accordingly.
//   * NOTE(review): x2/x3 are used as 64-bit values; confirm the caller passes
//     them with clean upper halves.

.global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:

    push_v_regs

    add             x14, x1, x2, lsl #2         // x14 -> reference row 5
    add             x12, x0, x3, lsl #2         // x12 -> output row 5

    // ---- rows 1, 5, 2, 6: load and build the shifted copies ----------------
    ld1             {v0.8b, v1.8b}, [x1], x2    // row1: v0 = bytes 0-7, v1 = bytes 8-15
    ld1             {v2.8b, v3.8b}, [x14], x2   // row5
    ld1             {v4.8b, v5.8b}, [x1], x2    // row2
    ld1             {v6.8b, v7.8b}, [x14], x2   // row6

    ext             v8.8b, v0.8b, v1.8b, #1     // v8  = row1 bytes 1-8
    ext             v12.8b, v2.8b, v3.8b, #1    // v12 = row5 bytes 1-8
    ext             v16.8b, v4.8b, v5.8b, #1    // v16 = row2 bytes 1-8
    ext             v20.8b, v6.8b, v7.8b, #1    // v20 = row6 bytes 1-8

    // ---- rows 3, 7, 4, 8: shifted copies land in v1/v3/v5/v7 ---------------
    ld1             {v9.8b, v10.8b}, [x1], x2   // row3
    ld1             {v13.8b, v14.8b}, [x14], x2 // row7
    ld1             {v17.8b, v18.8b}, [x1], x2  // row4
    ld1             {v21.8b, v22.8b}, [x14], x2 // row8

    ext             v1.8b, v9.8b, v10.8b, #1    // v1 = row3 bytes 1-8
    ext             v3.8b, v13.8b, v14.8b, #1   // v3 = row7 bytes 1-8
    ext             v5.8b, v17.8b, v18.8b, #1   // v5 = row4 bytes 1-8
    ext             v7.8b, v21.8b, v22.8b, #1   // v7 = row8 bytes 1-8

    // ---- rounded average of each row with its shifted copy -----------------
    urhadd          v0.16b, v0.16b, v8.16b      // out row1
    urhadd          v1.16b, v1.16b, v9.16b      // out row3
    urhadd          v2.16b, v2.16b, v12.16b     // out row5
    urhadd          v3.16b, v3.16b, v13.16b     // out row7
    urhadd          v4.16b, v4.16b, v16.16b     // out row2
    urhadd          v5.16b, v5.16b, v17.16b     // out row4
    urhadd          v6.16b, v6.16b, v20.16b     // out row6
    urhadd          v7.16b, v7.16b, v21.16b     // out row8

    // ---- store, alternating between the upper and lower half of the block --
    st1             {v0.8b}, [x0], x3           // row1
    st1             {v2.8b}, [x12], x3          // row5
    st1             {v4.8b}, [x0], x3           // row2
    st1             {v6.8b}, [x12], x3          // row6
    st1             {v1.8b}, [x0], x3           // row3
    st1             {v3.8b}, [x12], x3          // row7
    st1             {v5.8b}, [x0], x3           // row4
    st1             {v7.8b}, [x12], x3          // row8

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description :   This function pastes the reference block in the
////                        current frame buffer.This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution and VopRoundingType is 0 ..
////
//// Inputs             :   x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used     :   x14, v0-v18, v20, v22, v24, v26, v28, v30
////
//// Stack Usage        :   64 bytes
////
//// Outputs            :   The Motion Compensated Block
////
//// Return Data        :   None
////
//// Programming Note   :   <program limitation>
////-----------------------------------------------------------------------------
//*/

// Implementation notes
//   * Half-pel in both directions for an 8x8 block:
//       out[r][c] = (ref[r][c] + ref[r][c+1] +
//                    ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
//   * Step 1: nine reference rows are fetched as register pairs (16 bytes
//     read per row); EXT #1 replaces the second register of each pair with
//     the row shifted by one pixel.
//   * Step 2: UADDL widens and adds each row to its shifted copy, giving nine
//     rows of 16-bit horizontal sums h1..h9.
//   * Step 3: vertically adjacent sums are added and narrowed with RSHRN #2,
//     which applies the +2 rounding before the shift.
//   * Rows 1-4 are walked through x1/x0; rows 5-9 and output rows 5-8 go
//     through x14 (base + 4 * width).
//   * NOTE(review): 16 bytes are read per row although 9 are needed; confirm
//     the reference buffer is padded accordingly.
//   * NOTE(review): x2/x3 are used as 64-bit values; confirm the caller passes
//     them with clean upper halves.

.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    push_v_regs

    add             x14, x1, x2, lsl #2         // x14 -> reference row 5

    // ---- step 1: load rows and form the one-pixel-shifted copies -----------
    ld1             {v0.8b, v1.8b}, [x1], x2    // row1
    ld1             {v2.8b, v3.8b}, [x14], x2   // row5
    ld1             {v4.8b, v5.8b}, [x1], x2    // row2
    ld1             {v6.8b, v7.8b}, [x14], x2   // row6

    ext             v1.8b, v0.8b, v1.8b, #1     // v1 = row1 bytes 1-8
    ext             v3.8b, v2.8b, v3.8b, #1     // v3 = row5 bytes 1-8
    ext             v5.8b, v4.8b, v5.8b, #1     // v5 = row2 bytes 1-8
    ext             v7.8b, v6.8b, v7.8b, #1     // v7 = row6 bytes 1-8

    ld1             {v8.8b, v9.8b}, [x1], x2    // row3
    ld1             {v10.8b, v11.8b}, [x14], x2 // row7
    ld1             {v12.8b, v13.8b}, [x1], x2  // row4
    ld1             {v14.8b, v15.8b}, [x14], x2 // row8

    ext             v9.8b, v8.8b, v9.8b, #1     // v9 = row3 bytes 1-8

    ld1             {v16.8b, v17.8b}, [x14], x2 // row9

    ext             v11.8b, v10.8b, v11.8b, #1  // v11 = row7 bytes 1-8
    ext             v13.8b, v12.8b, v13.8b, #1  // v13 = row4 bytes 1-8
    ext             v15.8b, v14.8b, v15.8b, #1  // v15 = row8 bytes 1-8
    ext             v17.8b, v16.8b, v17.8b, #1  // v17 = row9 bytes 1-8

    // ---- step 2: horizontal sums, widened to 16 bits -----------------------
    uaddl           v0.8h, v0.8b, v1.8b         // h1
    uaddl           v2.8h, v2.8b, v3.8b         // h5
    uaddl           v4.8h, v4.8b, v5.8b         // h2
    uaddl           v6.8h, v6.8b, v7.8b         // h6
    uaddl           v8.8h, v8.8b, v9.8b         // h3
    uaddl           v10.8h, v10.8b, v11.8b      // h7
    uaddl           v12.8h, v12.8b, v13.8b      // h4
    uaddl           v14.8h, v14.8b, v15.8b      // h8
    uaddl           v16.8h, v16.8b, v17.8b      // h9

    // ---- step 3: vertical sums, rounding narrow, store ---------------------
    add             x14, x0, x3, lsl #2         // x14 -> output row 5

    add             v18.8h, v0.8h, v4.8h        // h1 + h2
    add             v26.8h, v2.8h, v6.8h        // h5 + h6
    add             v20.8h, v4.8h, v8.8h        // h2 + h3
    add             v28.8h, v6.8h, v10.8h       // h6 + h7

    rshrn           v18.8b, v18.8h, #2          // out row1
    rshrn           v26.8b, v26.8h, #2          // out row5
    rshrn           v20.8b, v20.8h, #2          // out row2
    rshrn           v28.8b, v28.8h, #2          // out row6

    add             v22.8h, v8.8h, v12.8h       // h3 + h4
    st1             {v18.8b}, [x0], x3          // store row1
    add             v30.8h, v10.8h, v14.8h      // h7 + h8
    st1             {v26.8b}, [x14], x3         // store row5
    add             v24.8h, v12.8h, v2.8h       // h4 + h5
    st1             {v20.8b}, [x0], x3          // store row2
    add             v14.8h, v14.8h, v16.8h      // h8 + h9
    st1             {v28.8b}, [x14], x3         // store row6

    rshrn           v22.8b, v22.8h, #2          // out row3
    rshrn           v30.8b, v30.8h, #2          // out row7
    rshrn           v24.8b, v24.8h, #2          // out row4
    rshrn           v14.8b, v14.8h, #2          // out row8

    st1             {v22.8b}, [x0], x3          // store row3
    st1             {v30.8b}, [x14], x3         // store row7
    st1             {v24.8b}, [x0], x3          // store row4
    st1             {v14.8b}, [x14], x3         // store row8

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description :   Copies an 8x8 block of bytes from the reference
////                        block to the current block without any filtering
////                        (eight rows of 8 bytes each).
////
//// Inputs             :   x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used     :   x12, x14, v0-v3
////
//// Stack Usage        :   64 bytes according to the original header; the
////                        actual amount is whatever push_v_regs (defined in
////                        impeg2_neon_macros.s) saves.
////
//// Outputs            :   The Motion Compensated Block
////
//// Return Data        :   None
////
//// Programming Note   :   Rows 1-4 are walked through x1/x0 and rows 5-8
////                        through x14/x12 (base + 4 * width), two rows from
////                        each half per pass.
////                        NOTE(review): x2/x3 are used as 64-bit values;
////                        confirm the caller passes them with clean upper
////                        halves.
////-----------------------------------------------------------------------------
//*/


.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:

    push_v_regs

    add             x14, x1, x2, lsl #2         // x14 -> reference row 5
    add             x12, x0, x3, lsl #2         // x12 -> output row 5

    // Pass 1 moves rows 1, 5, 2, 6; pass 2 moves rows 3, 7, 4, 8.
    .rept 2
    ld1             {v0.8b}, [x1], x2           // next row from the upper half
    ld1             {v1.8b}, [x14], x2          // next row from the lower half
    ld1             {v2.8b}, [x1], x2           // following row, upper half
    ld1             {v3.8b}, [x14], x2          // following row, lower half

    st1             {v0.8b}, [x0], x3           // same rows, same order, to the output
    st1             {v1.8b}, [x12], x3
    st1             {v2.8b}, [x0], x3
    st1             {v3.8b}, [x12], x3
    .endr

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_interpolate_av8()
////
//// Detail Description :   interpolates two buffers and adds pred
////
//// Inputs             :   x0 - pointer to src1
////                        x1 - pointer to src2
////                        x2 - dest buf
////                        x3 - dst stride
//// Registers Used     :   x12, v0-v15
////
//// Stack Usage        :   64 bytes
////
//// Outputs            :   The Motion Compensated Block
////
//// Return Data        :   None
////
//// Programming Note   :   <program limitation>
////-----------------------------------------------------------------------------
//*/

// Implementation notes
//   * x0, x1 and x2 each point to a descriptor holding three 8-byte plane
//     pointers at offsets 0 (y), 8 (u) and 16 (v).
//   * Every output byte is the rounded average of the two sources,
//     dst = (src1 + src2 + 1) >> 1 (URHADD). No other term is added in this
//     routine.
//   * Both sources are read as tightly packed rows (post-increment of 16
//     bytes per load); only the destination uses the stride in x3.
//   * Luma: 4 passes x 4 rows of 16 bytes = 16 rows, destination stride x3.
//   * Chroma: 2 passes (u, then v), each 8 rows of 8 bytes read as 64
//     contiguous bytes per source, destination stride x3 >> 1.
//   * Registers touched: x3 (halved), x4, x5, x7, x12, v0-v15.
//   * NOTE(review): x3 is used as a 64-bit value; confirm the caller passes it
//     with a clean upper half.

.global impeg2_interpolate_av8

impeg2_interpolate_av8:

    push_v_regs

    ldr             x4, [x0, #0]                // x4 = src1 y pointer
    ldr             x5, [x1, #0]                // x5 = src2 y pointer
    ldr             x7, [x2, #0]                // x7 = dst  y pointer

    mov             x12, #4                     // four passes of four luma rows

interp_lumablocks_stride:
    // four packed rows from each source
    ld1             {v0.16b}, [x4], #16         // src1 row a
    ld1             {v2.16b}, [x4], #16         // src1 row b
    ld1             {v4.16b}, [x4], #16         // src1 row c
    ld1             {v6.16b}, [x4], #16         // src1 row d

    ld1             {v8.16b}, [x5], #16         // src2 row a
    ld1             {v10.16b}, [x5], #16        // src2 row b
    ld1             {v12.16b}, [x5], #16        // src2 row c
    ld1             {v14.16b}, [x5], #16        // src2 row d

    urhadd          v0.16b, v0.16b, v8.16b      // average row a
    urhadd          v2.16b, v2.16b, v10.16b     // average row b
    urhadd          v4.16b, v4.16b, v12.16b     // average row c
    urhadd          v6.16b, v6.16b, v14.16b     // average row d

    st1             {v0.16b}, [x7], x3          // write row a, dst += stride
    st1             {v2.16b}, [x7], x3          // write row b
    st1             {v4.16b}, [x7], x3          // write row c
    st1             {v6.16b}, [x7], x3          // write row d

    subs            x12, x12, #1                // one pass done
    bne             interp_lumablocks_stride

    // ---- chroma ------------------------------------------------------------
    lsr             x3, x3, #1                  // chroma stride = stride >> 1

    ldr             x4, [x0, #8]                // x4 = src1 u pointer
    ldr             x5, [x1, #8]                // x5 = src2 u pointer
    ldr             x7, [x2, #8]                // x7 = dst  u pointer

    mov             x12, #2                     // two passes: u, then v

interp_chromablocks_stride:
    // each register receives one 8-byte row
    ld1             {v0.8b, v1.8b}, [x4], #16   // src1 rows 1, 2
    ld1             {v2.8b, v3.8b}, [x4], #16   // src1 rows 3, 4
    ld1             {v4.8b, v5.8b}, [x4], #16   // src1 rows 5, 6
    ld1             {v6.8b, v7.8b}, [x4], #16   // src1 rows 7, 8

    ld1             {v8.8b, v9.8b}, [x5], #16   // src2 rows 1, 2
    ld1             {v10.8b, v11.8b}, [x5], #16 // src2 rows 3, 4
    ld1             {v12.8b, v13.8b}, [x5], #16 // src2 rows 5, 6
    ld1             {v14.8b, v15.8b}, [x5], #16 // src2 rows 7, 8

    // upper 8 lanes are zero after the 8-byte loads; only the low 8 matter
    urhadd          v0.16b, v0.16b, v8.16b      // row 1
    urhadd          v1.16b, v1.16b, v9.16b      // row 2
    urhadd          v2.16b, v2.16b, v10.16b     // row 3
    urhadd          v3.16b, v3.16b, v11.16b     // row 4
    urhadd          v4.16b, v4.16b, v12.16b     // row 5
    urhadd          v5.16b, v5.16b, v13.16b     // row 6
    urhadd          v6.16b, v6.16b, v14.16b     // row 7
    urhadd          v7.16b, v7.16b, v15.16b     // row 8

    st1             {v0.8b}, [x7], x3           // row 1
    st1             {v1.8b}, [x7], x3           // row 2
    st1             {v2.8b}, [x7], x3           // row 3
    st1             {v3.8b}, [x7], x3           // row 4
    st1             {v4.8b}, [x7], x3           // row 5
    st1             {v5.8b}, [x7], x3           // row 6
    st1             {v6.8b}, [x7], x3           // row 7
    st1             {v7.8b}, [x7], x3           // row 8

    // Switch to the v plane for the second pass. These loads also run after
    // the final pass, where their results are simply not used.
    ldr             x4, [x0, #16]               // x4 = src1 v pointer
    ldr             x5, [x1, #16]               // x5 = src2 v pointer
    ldr             x7, [x2, #16]               // x7 = dst  v pointer

    subs            x12, x12, #1                // one pass done
    bne             interp_chromablocks_stride

    pop_v_regs
    ret