xref: /aosp_15_r20/external/libmpeg2/common/arm/ideint_cac_a9.s (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li@/******************************************************************************
2*a97c2a1fSXin Li@ *
3*a97c2a1fSXin Li@ * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li@ *
5*a97c2a1fSXin Li@ * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li@ * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li@ * You may obtain a copy of the License at:
8*a97c2a1fSXin Li@ *
9*a97c2a1fSXin Li@ * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li@ *
11*a97c2a1fSXin Li@ * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li@ * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li@ * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li@ * limitations under the License.
16*a97c2a1fSXin Li@ *
17*a97c2a1fSXin Li@ *****************************************************************************
18*a97c2a1fSXin Li@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li@*/
20*a97c2a1fSXin Li
21*a97c2a1fSXin Li@******************************************************************************
22*a97c2a1fSXin Li@*
23*a97c2a1fSXin Li@* @brief
@*  This file contains definitions of routines for combing artifact check (CAC)
25*a97c2a1fSXin Li@*
26*a97c2a1fSXin Li@* @author
27*a97c2a1fSXin Li@*  Ittiam
28*a97c2a1fSXin Li@*
29*a97c2a1fSXin Li@* @par List of Functions:
30*a97c2a1fSXin Li@*  - ideint_cac_8x8_a9()
31*a97c2a1fSXin Li@*
32*a97c2a1fSXin Li@* @remarks
33*a97c2a1fSXin Li@*  None
34*a97c2a1fSXin Li@*
35*a97c2a1fSXin Li@*******************************************************************************
36*a97c2a1fSXin Li
37*a97c2a1fSXin Li
38*a97c2a1fSXin Li@******************************************************************************
39*a97c2a1fSXin Li@*
40*a97c2a1fSXin Li@*  @brief Calculates Combing Artifact
41*a97c2a1fSXin Li@*
42*a97c2a1fSXin Li@*  @par   Description
@*   This function calculates the combing artifact check (CAC) for the two given fields
44*a97c2a1fSXin Li@*
45*a97c2a1fSXin Li@* @param[in] pu1_top
46*a97c2a1fSXin Li@*  UWORD8 pointer to top field
47*a97c2a1fSXin Li@*
48*a97c2a1fSXin Li@* @param[in] pu1_bot
49*a97c2a1fSXin Li@*  UWORD8 pointer to bottom field
50*a97c2a1fSXin Li@*
51*a97c2a1fSXin Li@* @param[in] top_strd
52*a97c2a1fSXin Li@*  Top field stride
53*a97c2a1fSXin Li@*
54*a97c2a1fSXin Li@* @param[in] bot_strd
55*a97c2a1fSXin Li@*  Bottom field stride
56*a97c2a1fSXin Li@*
@* @returns
@*  1 if alt < adj for either 4-column half of the block, 0 otherwise
59*a97c2a1fSXin Li@*
60*a97c2a1fSXin Li@* @remarks
61*a97c2a1fSXin Li@*
62*a97c2a1fSXin Li@******************************************************************************
63*a97c2a1fSXin Li
64*a97c2a1fSXin Li    .global ideint_cac_8x8_a9
65*a97c2a1fSXin Li
ideint_cac_8x8_a9:
    @ In:       r0 = pu1_top, r1 = pu1_bot, r2 = top_strd, r3 = bot_strd
    @ Out:      r0 = 1 if (alt < adj) for either 4-column half, else 0
    @ Reads:    four 8-byte rows from each of the two field pointers
    @ Clobbers: r0, r1, flags, q0-q3, q8-q10, q12-q15
    @
    @ Leaf routine. Every register written here is caller-saved under the
    @ ARM procedure call standard and the stack is never touched, so no
    @ prologue/epilogue save or restore is required.

    @ Load first row of top
    vld1.u8     d28,    [r0],   r2

    @ Load first row of bottom
    vld1.u8     d29,    [r1],   r3

    @ Load second row of top
    vld1.u8     d30,    [r0],   r2

    @ Load second row of bottom
    vld1.u8     d31,    [r1],   r3


    @ Calculate row based adj and alt values
    @ Get row sums: two widening pairwise adds turn each group of four
    @ adjacent pixels into one 32 bit sum
    vpaddl.u8   q0,     q14

    vpaddl.u8   q1,     q15

    vpaddl.u16  q0,     q0

    vpaddl.u16  q1,     q1

    @ q0 = {top0 cols0-3, top0 cols4-7, bot0 cols0-3, bot0 cols4-7}
    @ q1 = same layout for row 1
    @ Pack q0 and q1 into a single register: a sum of four bytes is at most
    @ 4 * 255, so it fits in 16 bits and q1 can live in the upper halfwords

    vshl.u32    q8,     q1,     #16
    vorr.u32    q8,     q0,     q8
    @ q8 now contains 8 sums: d16 = top (rows 0,1), d17 = bottom (rows 0,1)

    @ Load third row of top
    vld1.u8     d24,    [r0],   r2

    @ Load third row of bottom
    vld1.u8     d25,    [r1],   r3

    @ Load fourth row of top
    vld1.u8     d26,    [r0],   r2

    @ Load fourth row of bottom
    vld1.u8     d27,    [r1],   r3

    @ Get row sums
    vpaddl.u8   q2,     q12

    vpaddl.u8   q3,     q13

    vpaddl.u16  q2,     q2

    vpaddl.u16  q3,     q3
    @ q2 / q3 hold the 32 bit sums for rows 2 / 3, same layout as q0 / q1
    @ Pack q2 and q3 into a single register (sum does not exceed 16bits)

    vshl.u32    q9,     q3,     #16
    vorr.u32    q9,     q2,     q9
    @ q9 now contains 8 sums: d18 = top (rows 2,3), d19 = bottom (rows 2,3)

    @ Compute absolute diff between top and bottom row sums
    vabd.u16    d16,    d16,    d17
    vabd.u16    d17,    d18,    d19

    @ RSUM_CSUM_THRESH (q9 is free again: its sums were consumed above)
    vmov.u16    q9,     #20

    @ Eliminate values smaller than RSUM_CSUM_THRESH
    vcge.u16    q10,    q8,     q9
    vand.u16    q10,    q8,     q10
    @ q10 now contains 8 absolute diff of sums above the threshold


    @ Compute adj
    vadd.u16    d20,    d20,    d21

    @ d20 has four adj values for two sub-blocks:
    @ lanes 0,1 belong to cols 0-3 and lanes 2,3 to cols 4-7

    @ Compute alt: absolute diff between consecutive rows of the same field
    vabd.u32    q0,     q0,     q1
    vabd.u32    q2,     q2,     q3

    vadd.u32    q0,     q0,     q2
    vadd.u32    d21,    d0,     d1
    @ d21 has two values for two sub-blocks (top + bottom contributions)


    @ Calculate column based adj and alt values

    @ Rounding-average the four rows of each field per column:
    @ d0 = top field, d1 = bottom field
    vrhadd.u8   q0,     q14,    q15
    vrhadd.u8   q1,     q12,    q13
    vrhadd.u8   q0,     q0,     q1

    vabd.u8     d0,     d0,     d1

    @ RSUM_CSUM_THRESH >> 2
    @ d2 is dead here (q1 was consumed above and d2 is rewritten below),
    @ so the threshold needs no callee-saved register
    vmov.u8     d2,     #5

    @ Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    vcge.u8     d1,     d0,     d2
    vand.u8     d0,     d0,     d1
    @ d0 now contains 8 absolute diff of sums above the threshold


    @ Pair up columns and scale by 4 to undo the averaging over four rows
    vpaddl.u8   d0,     d0
    vshl.u16    d0,     d0,     #2

    @ Add row based adj
    vadd.u16    d20,    d0,     d20

    vpaddl.u16  d20,    d20
    @ d20 now contains 2 adj values


    @ Column averages over rows 0 and 2 of both fields
    vrhadd.u8   d0,     d28,    d29
    vrhadd.u8   d2,     d24,    d25
    vrhadd.u8   d0,     d0,     d2

    @ Column averages over rows 1 and 3 of both fields
    vrhadd.u8   d1,     d30,    d31
    vrhadd.u8   d3,     d26,    d27
    vrhadd.u8   d1,     d1,     d3

    vabd.u8     d0,     d0,     d1
    vpaddl.u8   d0,     d0

    vshl.u16    d0,     d0,     #2
    vpaddl.u16  d0,     d0
    vadd.u32    d21,    d0,     d21


    @ d21 now contains 2 alt values

    @ SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    vshr.u32    d0,     d21,    #3
    vadd.u32    d21,    d21,    d0

    @ SAD_BIAS_ADDITIVE >> 1
    vmov.u32    d0,     #4
    vadd.u32    d21,    d21,    d0

    @ Per sub-block mask: all ones where alt < adj
    vclt.u32    d0,     d21,    d20
    @ Fold both masks together; the low word is non-zero if either is set
    vpaddl.u32  d0,     d0

    @ Normalise the folded mask to 0 / 1 in the return register
    vmov.u32    r0,     d0[0]
    cmp         r0,     #0
    movne       r0,     #1
    bx          lr
216