//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt.
//* Ltd, Bangalore
//*/

//******************************************************************************
//*
//*  @brief
//*  This file contains definitions of routines for spatial filter
//*
//*  @author
//*  Ittiam
//*
//*  @par List of Functions:
//*  - ideint_cac_8x8_av8()
//*
//*  @remarks
//*  Target: AArch64 (ARMv8-A, Advanced SIMD), GNU assembler syntax, AAPCS64.
//*
//*******************************************************************************


//******************************************************************************
//*
//*  @brief Calculates Combing Artifact
//*
//*  @par   Description
//*   Calculates the combing artifact check (CAC) for the given two fields.
//*   Four rows of eight pixels are read from each field. The eight columns are
//*   treated as two independent 4-column sub-blocks (left and right). For each
//*   sub-block an "adj" measure (top field vs. bottom field) and an "alt"
//*   measure (row vs. next row of the same field) are accumulated from row
//*   sums and from column averages. alt is then biased:
//*       alt = alt + (alt >> 3) + 4
//*   and the sub-block is flagged when adj > alt (unsigned).
//*
//*  @param[in] pu1_top   (x0)
//*   UWORD8 pointer to top field
//*
//*  @param[in] pu1_bot   (x1)
//*   UWORD8 pointer to bottom field
//*
//*  @param[in] top_strd  (w2)
//*   Top field stride in bytes. Read as a signed 32-bit value and
//*   sign-extended here (assumes the C prototype declares it as a 32-bit
//*   int - confirm against the caller's declaration).
//*
//*  @param[in] bot_strd  (w3)
//*   Bottom field stride in bytes. Same handling as top_strd.
//*
//*  @returns
//*   w0 = 1 if either 4-column sub-block is flagged (adj > alt), 0 otherwise
//*
//*  @remarks
//*   Leaf routine, no stack usage.
//*   Clobbers: x0-x3, v0-v4, v6, v16-v22, v24-v31, condition flags.
//*   No callee-saved register (x19-x28, v8-v15) is touched.
//*
//******************************************************************************

    .global ideint_cac_8x8_av8

ideint_cac_8x8_av8:

    // The strides arrive as 32-bit values; AAPCS64 leaves bits [63:32] of
    // x2/x3 unspecified. Sign-extend before using them as 64-bit post-index
    // increments so that garbage upper bits (or a negative stride written
    // through a w-register) cannot corrupt the row addresses.
    sxtw    x2, w2
    sxtw    x3, w3

    // Row 0: v28 = { top row 0 (low 8 bytes), bottom row 0 (high 8 bytes) }
    // v29 keeps a separate copy of bottom row 0 for the column "alt" step.
    ld1     {v28.8b}, [x0], x2
    ld1     {v29.8b}, [x1], x3
    mov     v28.d[1], v29.d[0]

    // Row 1: v30 = { top row 1, bottom row 1 }, v31 = bottom row 1
    ld1     {v30.8b}, [x0], x2
    ld1     {v31.8b}, [x1], x3
    mov     v30.d[1], v31.d[0]

    // Row sums over groups of four pixels.
    // After the two pairwise widenings each register holds four 32-bit sums:
    //   { top-left, top-right, bottom-left, bottom-right }
    uaddlp  v0.8h, v28.16b
    uaddlp  v2.8h, v30.16b
    uaddlp  v0.4s, v0.8h
    uaddlp  v2.4s, v2.8h

    // Pack row 0 and row 1 sums into 16-bit lanes of one register.
    // Each sum is at most 4 * 255 = 1020, so it fits in 16 bits.
    //   v16.8h = { T0L, T1L, T0R, T1R, B0L, B1L, B0R, B1R }
    shl     v16.4s, v2.4s, #16
    orr     v16.16b, v0.16b, v16.16b

    // Row 2: v24 = { top row 2, bottom row 2 }, v25 = bottom row 2
    ld1     {v24.8b}, [x0], x2
    ld1     {v25.8b}, [x1], x3
    mov     v24.d[1], v25.d[0]

    // Row 3: v26 = { top row 3, bottom row 3 }, v27 = bottom row 3
    ld1     {v26.8b}, [x0], x2
    ld1     {v27.8b}, [x1], x3
    mov     v26.d[1], v27.d[0]

    // Row sums for rows 2 and 3, same layout as v0 / v2
    uaddlp  v4.8h, v24.16b
    uaddlp  v6.8h, v26.16b
    uaddlp  v4.4s, v4.8h
    uaddlp  v6.4s, v6.8h

    // Pack row 2 and row 3 sums:
    //   v18.8h = { T2L, T3L, T2R, T3R, B2L, B3L, B2R, B3R }
    shl     v18.4s, v6.4s, #16
    orr     v18.16b, v4.16b, v18.16b

    // Row-based adj: |top row sum - bottom row sum| for every row/sub-block
    mov     v17.d[0], v16.d[1]
    uabd    v16.4h, v16.4h, v17.4h

    mov     v19.d[0], v18.d[1]
    uabd    v17.4h, v18.4h, v19.4h

    // v16.8h = { r0L, r1L, r0R, r1R, r2L, r3L, r2R, r3R }
    mov     v16.d[1], v17.d[0]

    // RSUM_CSUM_THRESH
    movi    v18.8h, #20

    // Zero every difference that is below RSUM_CSUM_THRESH
    cmhs    v20.8h, v16.8h, v18.8h
    and     v20.16b, v16.16b, v20.16b

    // Fold rows 2/3 onto rows 0/1:
    //   v20.4h = { r0L+r2L, r1L+r3L, r0R+r2R, r1R+r3R }
    mov     v21.d[0], v20.d[1]
    add     v20.4h, v20.4h, v21.4h

    // Row-based alt: |row 0 sum - row 1 sum| + |row 2 sum - row 3 sum|
    // within the same field, per { top-L, top-R, bottom-L, bottom-R }
    uabd    v0.4s, v0.4s, v2.4s
    uabd    v4.4s, v4.4s, v6.4s
    add     v0.4s, v0.4s, v4.4s

    // Add the bottom-field half onto the top-field half:
    //   v21.2s = { alt left, alt right }
    // Only the low 64 bits of v1 are written, so add on 2 lanes only.
    mov     v1.d[0], v0.d[1]
    add     v21.2s, v0.2s, v1.2s

    // Column-based adj: rounded average of the four rows of each field,
    // per column (top in the low half, bottom in the high half)
    urhadd  v0.16b, v28.16b, v30.16b
    urhadd  v2.16b, v24.16b, v26.16b
    urhadd  v0.16b, v0.16b, v2.16b

    // |top column average - bottom column average| for the 8 columns
    mov     v1.d[0], v0.d[1]
    uabd    v0.8b, v0.8b, v1.8b

    // RSUM_CSUM_THRESH >> 2
    movi    v22.16b, #5

    // Zero every column difference below RSUM_CSUM_THRESH >> 2
    cmhs    v1.16b, v0.16b, v22.16b
    and     v0.16b, v0.16b, v1.16b

    // Sum column pairs and scale by 4 (the values were averages of 4 rows)
    uaddlp  v0.4h, v0.8b
    shl     v0.4h, v0.4h, #2

    // Add the row-based adj, then reduce to one value per sub-block:
    //   v20.2s = { adj left, adj right }
    add     v20.4h, v0.4h, v20.4h
    uaddlp  v20.2s, v20.4h

    // Column-based alt: average of rows 0 and 2 (both fields) against the
    // average of rows 1 and 3 (both fields), per column
    urhadd  v0.8b, v28.8b, v29.8b
    urhadd  v2.8b, v24.8b, v25.8b
    urhadd  v0.8b, v0.8b, v2.8b

    urhadd  v1.8b, v30.8b, v31.8b
    urhadd  v3.8b, v26.8b, v27.8b
    urhadd  v1.8b, v1.8b, v3.8b

    uabd    v0.8b, v0.8b, v1.8b
    uaddlp  v0.4h, v0.8b

    // Scale by 4, reduce to one value per sub-block, add the row-based alt:
    //   v21.2s = { alt left, alt right }
    shl     v0.4h, v0.4h, #2
    uaddlp  v0.2s, v0.4h
    add     v21.2s, v0.2s, v21.2s

    // SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    ushr    v0.2s, v21.2s, #3
    add     v21.2s, v21.2s, v0.2s

    // SAD_BIAS_ADDITIVE >> 1: alt += 4
    movi    v0.2s, #4
    add     v21.2s, v21.2s, v0.2s

    // Per sub-block mask: all ones where adj > alt (unsigned), else zero.
    // The 64-bit write also clears the upper half of v0.
    cmhi    v0.2s, v20.2s, v21.2s

    // Return 1 if either sub-block mask is set, 0 otherwise
    fmov    x0, d0
    cmp     x0, #0
    cset    w0, ne

    ret