xref: /aosp_15_r20/external/libmpeg2/common/armv8/ideint_cac_av8.s (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li//******************************************************************************
2*a97c2a1fSXin Li//*
3*a97c2a1fSXin Li//* Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li//*
5*a97c2a1fSXin Li//* Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li//* you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li//* You may obtain a copy of the License at:
8*a97c2a1fSXin Li//*
9*a97c2a1fSXin Li//* http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li//*
11*a97c2a1fSXin Li//* Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li//* distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li//* See the License for the specific language governing permissions and
15*a97c2a1fSXin Li//* limitations under the License.
16*a97c2a1fSXin Li//*
17*a97c2a1fSXin Li//*****************************************************************************
18*a97c2a1fSXin Li//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li//*/
20*a97c2a1fSXin Li
21*a97c2a1fSXin Li//******************************************************************************
22*a97c2a1fSXin Li//*
23*a97c2a1fSXin Li//* @brief
//*  This file contains definitions of routines for combing artifact check (CAC)
25*a97c2a1fSXin Li//*
26*a97c2a1fSXin Li//* @author
27*a97c2a1fSXin Li//*  Ittiam
28*a97c2a1fSXin Li//*
29*a97c2a1fSXin Li//* @par List of Functions:
30*a97c2a1fSXin Li//*  - ideint_cac_8x8_av8()
31*a97c2a1fSXin Li//*
32*a97c2a1fSXin Li//* @remarks
33*a97c2a1fSXin Li//*  None
34*a97c2a1fSXin Li//*
35*a97c2a1fSXin Li//*******************************************************************************
36*a97c2a1fSXin Li
37*a97c2a1fSXin Li
//******************************************************************************
//*
//*  @brief Calculates Combing Artifact
//*
//*  @par   Description
//*   This function performs the combing artifact check (CAC) on four rows of
//*   eight pixels read from each of the two given fields. The eight columns
//*   are handled as two 4-column sub-blocks (columns 0-3 and columns 4-7).
//*   For each sub-block an "adj" value (top field compared against bottom
//*   field) and an "alt" value (consecutive rows of the same field compared
//*   against each other) are accumulated from row sums and column averages.
//*
//* @param[in] pu1_top
//*  UWORD8 pointer to top field (x0)
//*
//* @param[in] pu1_bot
//*  UWORD8 pointer to bottom field (x1)
//*
//* @param[in] top_strd
//*  Top field stride (w2, sign-extended to 64 bits on entry)
//*
//* @param[in] bot_strd
//*  Bottom field stride (w3, sign-extended to 64 bits on entry)
//*
//* @returns
//*  x0 = 1 if adj > (alt + (alt >> 3) + 4) for at least one of the two
//*  sub-blocks, 0 otherwise
//*
//* @remarks
//*  Leaf routine, no stack usage.
//*  Registers written: x0-x3, flags, v0-v4, v6, v16-v22, v24-v31.
//*  The callee-saved registers v8-v15 are not used.
//*
//******************************************************************************

    .global ideint_cac_8x8_av8

ideint_cac_8x8_av8:

    // x2/x3 are used below as 64-bit post-index offsets. AAPCS64 leaves the
    // upper 32 bits of a register carrying a 32-bit argument unspecified, so
    // widen both strides explicitly before the first load.
    // NOTE(review): assumes the C prototype declares top_strd/bot_strd as a
    // signed 32-bit type (WORD32) - confirm against the header.
    sxtw    x2,             w2
    sxtw    x3,             w3

    // Load first row of top
    ld1     {v28.8b},       [x0],       x2

    // Load first row of bottom
    ld1     {v29.8b},       [x1],       x3
    // v28 = top row 0 (low half) | bottom row 0 (high half)
    mov     v28.d[1],       v29.d[0]

    // Load second row of top
    ld1     {v30.8b},       [x0],       x2

    // Load second row of bottom
    ld1     {v31.8b},       [x1],       x3
    // v30 = top row 1 (low half) | bottom row 1 (high half)
    mov     v30.d[1],       v31.d[0]


    // Calculate row based adj and alt values
    // Get row sums: two pairwise widening adds turn each group of four
    // adjacent pixels into one 32-bit sum
    uaddlp  v0.8h,          v28.16b

    uaddlp  v2.8h,          v30.16b

    uaddlp  v0.4s,          v0.8h

    uaddlp  v2.4s,          v2.8h

    // v0 = {top0[0-3], top0[4-7], bot0[0-3], bot0[4-7]}, v2 likewise for row 1
    // Pack v0 and v2 into a single register (a sum of four bytes is at most
    // 4 * 255, so it fits in 16 bits)

    shl     v16.4s,         v2.4s,      #16
    orr     v16.16b,        v0.16b,     v16.16b
    // v16 now contains 8 sums (row 0 in even 16-bit lanes, row 1 in odd lanes)

    // Load third row of top
    ld1     {v24.8b},       [x0],       x2

    // Load third row of bottom
    ld1     {v25.8b},       [x1],       x3
    // v24 = top row 2 (low half) | bottom row 2 (high half)
    mov     v24.d[1],       v25.d[0]

    // Load fourth row of top
    ld1     {v26.8b},       [x0],       x2

    // Load fourth row of bottom
    ld1     {v27.8b},       [x1],       x3
    // v26 = top row 3 (low half) | bottom row 3 (high half)
    mov     v26.d[1],       v27.d[0]

    // Get row sums
    uaddlp  v4.8h,          v24.16b

    uaddlp  v6.8h,          v26.16b

    uaddlp  v4.4s,          v4.8h

    uaddlp  v6.4s,          v6.8h
    // v4 and v6 hold the 32-bit sums of rows 2 and 3, laid out like v0 and v2
    // Pack v4 and v6 into a single register (sum does not exceed 16 bits)

    shl     v18.4s,         v6.4s,      #16
    orr     v18.16b,        v4.16b,     v18.16b
    // v18 now contains 8 sums (row 2 in even 16-bit lanes, row 3 in odd lanes)

    // Compute absolute diff between top and bottom row sums
    // (top sums sit in the low half, bottom sums in the high half)
    mov     v17.d[0],       v16.d[1]
    uabd    v16.4h,         v16.4h,     v17.4h

    mov     v19.d[0],       v18.d[1]
    uabd    v17.4h,         v18.4h,     v19.4h

    // v16 = {rows 0/1 diffs (low half), rows 2/3 diffs (high half)}
    mov     v16.d[1],       v17.d[0]

    // RSUM_CSUM_THRESH
    movi    v18.8h,         #20

    // Eliminate values smaller than RSUM_CSUM_THRESH
    // (compare gives an all-ones lane mask, AND zeroes the rejected lanes)
    cmhs    v20.8h,         v16.8h,     v18.8h
    and     v20.16b,        v16.16b,    v20.16b

    // v20 now contains 8 absolute diff of sums above the threshold

    // Compute adj
    mov     v21.d[0],       v20.d[1]
    add     v20.4h,         v20.4h,     v21.4h

    // v20 has four adj values for two sub-blocks
    // (lanes 0-1: columns 0-3, lanes 2-3: columns 4-7)

    // Compute alt: absolute diff between sums of consecutive rows of the
    // same field (row 0 vs row 1, row 2 vs row 3)
    uabd    v0.4s,      v0.4s,      v2.4s
    uabd    v4.4s,      v4.4s,      v6.4s

    add     v0.4s,      v0.4s,      v4.4s

    // Fold the bottom-field half onto the top-field half. Only two lanes are
    // meaningful, so add in .2s form and never read the unwritten upper half
    // of v1.
    mov     v1.d[0],    v0.d[1]
    add     v21.2s,     v0.2s,      v1.2s
    // d21 has two values for two sub-blocks


    // Calculate column based adj and alt values

    // Rounding average of the four rows, per column, for top (low half) and
    // bottom (high half) at once
    urhadd  v0.16b,     v28.16b,    v30.16b
    urhadd  v2.16b,     v24.16b,    v26.16b
    urhadd  v0.16b,     v0.16b,     v2.16b

    // Per-column absolute diff between top and bottom averages
    mov     v1.d[0],    v0.d[1]
    uabd    v0.8b,      v0.8b,      v1.8b

    // RSUM_CSUM_THRESH >> 2
    movi    v22.16b,        #5

    // Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    cmhs    v1.16b,      v0.16b,        v22.16b
    and     v0.16b,      v0.16b,        v1.16b
    // d0 now contains 8 absolute diff of column averages above the threshold


    // Sum adjacent columns and scale by 4
    uaddlp  v0.4h,      v0.8b
    shl     v0.4h,      v0.4h,#2

    // Add row based adj
    add     v20.4h,     v0.4h,      v20.4h

    uaddlp  v20.2s,     v20.4h
    // d20 now contains 2 adj values


    // Column based alt: per-column average of rows 0 and 2 (both fields)
    // against the per-column average of rows 1 and 3 (both fields)
    urhadd  v0.8b,      v28.8b,     v29.8b
    urhadd  v2.8b,      v24.8b,     v25.8b
    urhadd  v0.8b,      v0.8b,      v2.8b

    urhadd  v1.8b,      v30.8b,     v31.8b
    urhadd  v3.8b,      v26.8b,     v27.8b
    urhadd  v1.8b,      v1.8b,      v3.8b

    uabd    v0.8b,      v0.8b,      v1.8b
    uaddlp  v0.4h,      v0.8b

    shl     v0.4h,      v0.4h,      #2
    uaddlp  v0.2s,      v0.4h
    add     v21.2s,     v0.2s,      v21.2s


    // d21 now contains 2 alt values

    // SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    ushr    v0.2s,      v21.2s,     #3
    add     v21.2s,     v21.2s,     v0.2s

    // SAD_BIAS_ADDITIVE >> 1
    movi    v0.2s,      #4
    add     v21.2s,     v21.2s,     v0.2s

    // Each lane becomes all-ones where adj > alt (unsigned), else zero
    cmhi    v0.2s,      v20.2s,     v21.2s
    // Combine both lanes: the low 32 bits of the sum are 0 only if neither
    // lane was set (0xFFFFFFFF for one lane, 0xFFFFFFFE for both)
    uaddlp  v0.1d,      v0.2s

    // Return 1 if either sub-block was flagged, 0 otherwise
    smov    x0,         v0.s[0]
    cmp     x0,         #0
    cset    x0,         ne
    ret
226