xref: /aosp_15_r20/external/libavc/common/armv8/ih264_deblk_chroma_av8.s (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///*                                                                           */
///*  File Name         : ih264_deblk_chroma_av8.s                             */
///*                                                                           */
///*  Description       : Contains function definitions for deblocking chroma  */
///*                      edges. Functions are coded in NEON assembly and can  */
///*                      be compiled using ARM RVDS.                          */
///*                                                                           */
///*  List of Functions : ih264_deblk_chroma_vert_bs4_av8()                    */
///*                      ih264_deblk_chroma_vert_bslt4_av8()                  */
///*                      ih264_deblk_chroma_horz_bs4_av8()                    */
///*                      ih264_deblk_chroma_horz_bslt4_av8()                  */
///*  Issues / Problems : None                                                 */
///*                                                                           */
///*  Revision History  :                                                      */
///*                                                                           */
///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
///*         28 11 2013   Ittiam          Draft                                */
///*****************************************************************************/

// Assembler setup: code section, 4-byte alignment for AArch64 instructions,
// and the shared macro file providing push_v_regs / pop_v_regs (save/restore
// of the AAPCS64 callee-saved SIMD registers v8-v15).
.text
.p2align 2
.include "ih264_neon_macros.s"

///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0 (interleaved UV, loaded with ld2)
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @returns
//*  None
//*
//* @remarks
//*  Register layout: U data in the low D half, V data in the high D half of
//*  each 128-bit Q register, so one 16-byte op filters both planes at once.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bs4_av8

ih264_deblk_chroma_horz_bs4_av8:

    // STMFD sp!,{x4-x6,x14}            //
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x1, w1                    //sign-extend stride for pointer arithmetic
    mov       x6, x5                    //x6 = beta_cr
    mov       x5, x4                    //x5 = alpha_cr
    sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixel pointing to p1 of chroma
    ld2       {v6.8b, v7.8b}, [x0], x1  //D6 = p1u , D7 = p1v
    mov       x4, x0                    //Keeping a backup of the pointer p0 of chroma
    ld2       {v4.8b, v5.8b}, [x0], x1  //D4 = p0u , D5 = p0v
    dup       v20.8b, w2                //D20 contains alpha_cb
    dup       v21.8b, w5                //D21 contains alpha_cr
    mov       v20.d[1], v21.d[0]        //Q10 = (alpha_cr | alpha_cb)
    ld2       {v0.8b, v1.8b}, [x0], x1  //D0 = q0u , D1 = q0v
    uaddl     v8.8h, v6.8b, v0.8b       //
    uaddl     v10.8h, v7.8b, v1.8b      //Q4,Q5 = q0 + p1
    movi      v31.8b, #2                //constant 2 for the umlal weighting
    ld2       {v2.8b, v3.8b}, [x0]      //D2 = q1u , D3 = q1v
    mov       v0.d[1], v1.d[0]          //pack U|V into full Q registers
    mov       v2.d[1], v3.d[0]
    mov       v4.d[1], v5.d[0]
    mov       v6.d[1], v7.d[0]
    uabd      v26.16b, v6.16b , v4.16b  //Q13 = ABS(p1 - p0)
    umlal     v8.8h, v2.8b, v31.8b      //
    umlal     v10.8h, v3.8b, v31.8b     //Q5,Q4 = (X2(q1U) + q0U + p1U)
    uabd      v22.16b, v4.16b , v0.16b  //Q11 = ABS(p0 - q0)
    uabd      v24.16b, v2.16b , v0.16b  //Q12 = ABS(q1 - q0)
    uaddl     v14.8h, v4.8b, v2.8b      //
    uaddl     v28.8h, v5.8b, v3.8b      //Q14,Q7 = P0 + Q1
    dup       v16.8b, w3                //D16 contains beta_cb
    dup       v17.8b, w6                //D17 contains beta_cr
    mov       v16.d[1], v17.d[0]        //Q8 = (beta_cr | beta_cb)
    umlal     v14.8h, v6.8b, v31.8b     //
    umlal     v28.8h, v7.8b, v31.8b     //Q14,Q7 = (X2(p1U) + p0U + q1U)
    cmhs      v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= alpha ?
    cmhs      v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= beta ?
    cmhs      v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= beta ?
    rshrn     v8.8b, v8.8h, #2          //
    rshrn     v9.8b, v10.8h, #2         //Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
    mov       v8.d[1], v9.d[0]
    orr       v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    rshrn     v10.8b, v14.8h, #2        //
    rshrn     v11.8b, v28.8h, #2        //Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
    mov       v10.d[1], v11.d[0]
    orr       v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    bit       v10.16b, v4.16b , v18.16b //keep original p0 where the filter condition fails
    bit       v8.16b, v0.16b , v18.16b  //keep original q0 where the filter condition fails
    mov       v11.d[0], v10.d[1]        //split U|V halves back for interleaved store
    mov       v9.d[0], v8.d[1]
    st2       {v10.8b, v11.8b}, [x4], x1 //store filtered p0 row
    st2       {v8.8b, v9.8b}, [x4]      //store filtered q0 row
    // LDMFD sp!,{x4-x6,pc}                //
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0 (interleaved UV)
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @returns
//*  None
//*
//* @remarks
//*  The edge is vertical, so 8 rows of 4 UV pairs are gathered with ld4
//*  (one 16-bit lane = one UV pair) and the register shuffles below act as
//*  the transpose between row order and p1/p0/q0/q1 column order.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bs4_av8

ih264_deblk_chroma_vert_bs4_av8:

    // STMFD sp!,{x4,x5,x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x1, w1                    //sign-extend stride for pointer arithmetic

    sub       x0, x0, #4                //point x0 to p1u of row0.
    mov       x12, x0                   //keep a back up of x0 for buffer write

    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)

    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 //rows 0-3: each lane = one UV pair
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1 //rows 4-7
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    mov       v10.16b, v2.16b           //shuffle so that v0/v2/v4/v6 hold
    mov       v2.16b, v1.16b            //p1/p0/q0/q1 for all 8 rows
    mov       v1.16b, v4.16b            //(v10 is the swap scratch)
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b

    dup       v22.8h, w2                //Q11 = alpha (cr in odd bytes, cb in even)
    dup       v24.8h, w3                //Q12 = beta
    movi      v31.8b, #2                //constant 2 for the umlal weighting

    mov       v0.d[1], v1.d[0]          //pack rows 0-7 into full Q registers
    mov       v2.d[1], v3.d[0]
    mov       v4.d[1], v5.d[0]
    mov       v6.d[1], v7.d[0]

    uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    uaddl     v14.8h, v2.8b, v6.8b
    uaddl     v16.8h, v3.8b, v7.8b      //(p0 + q1)
    cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    umlal     v14.8h, v0.8b, v31.8b
    umlal     v16.8h, v1.8b, v31.8b     //2*p1 + (p0 + q1)
    uaddl     v18.8h, v0.8b, v4.8b
    uaddl     v20.8h, v1.8b, v5.8b      //(p1 + q0)
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    umlal     v18.8h, v6.8b, v31.8b
    umlal     v20.8h, v7.8b, v31.8b     //2*q1 + (p1 + q0)

    rshrn     v14.8b, v14.8h, #2
    rshrn     v15.8b, v16.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2
    mov       v14.d[1], v15.d[0]
    and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    rshrn     v18.8b, v18.8h, #2
    rshrn     v19.8b, v20.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2
    mov       v18.d[1], v19.d[0]
    bit       v2.16b, v14.16b , v8.16b  //p0 = filtered value where condition holds
    bit       v4.16b, v18.16b , v8.16b  //q0 = filtered value where condition holds

    mov       v1.d[0], v0.d[1]          //unpack rows 4-7 back into D halves
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    mov       v10.16b, v1.16b           //inverse shuffle back to row order
    mov       v1.16b, v2.16b            //for the interleaved st4 writes
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b

    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1 //rows 0-3
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1 //rows 4-7
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    // LDMFD sp!,{x4,x5,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0 (interleaved UV)
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @param[in] w6 - u4_bs
//*    Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*    tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*    tc0_table for V
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bslt4_av8

ih264_deblk_chroma_horz_bslt4_av8:

    // STMFD sp!,{x4-x9,x14}        //
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x1, w1                    //sign-extend stride for pointer arithmetic
    ldr       x8, [sp, #80]             //x8 = pu1_cliptab_cr (stack arg, past saved regs)
    sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
    rev       w6, w6                    //byte-reverse bs so tbl indexes match pixel order
    mov       v12.s[0], w6              //D12[0] = ui_Bs
    ld1       {v16.s}[0], [x7]          //D16[0] contains cliptab_cb
    ld1       {v17.s}[0], [x8]          //D17[0] contains cliptab_cr
    ld2       {v6.8b, v7.8b}, [x0], x1  //Q3=p1
    tbl       v14.8b, {v16.16b}, v12.8b //Retrieving cliptab values for U
    tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V
    uxtl      v12.8h, v12.8b            //Q6 = uc_Bs in each 16 bit scalar
    mov       x6, x0                    //Keeping a backup of the pointer to chroma U P0
    ld2       {v4.8b, v5.8b}, [x0], x1  //Q2=p0
    movi      v30.8b, #1                //constant 1 for C = C0 + 1
    dup       v20.8b, w2                //D20 contains alpha_cb
    dup       v21.8b, w4                //D21 contains alpha_cr
    mov       v20.d[1], v21.d[0]        //Q10 = (alpha_cr | alpha_cb)
    ld2       {v0.8b, v1.8b}, [x0], x1  //Q0=q0
    uxtl      v14.8h, v14.8b            //
    uxtl      v28.8h, v28.8b            //
    mov       v15.d[0], v28.d[0]        //D14 has cliptab values for U, D15 for V
    mov       v14.d[1], v28.d[0]
    ld2       {v2.8b, v3.8b}, [x0]      //Q1=q1
    usubl     v10.8h, v1.8b, v5.8b      //
    usubl     v8.8h, v0.8b, v4.8b       //Q5,Q4 = (q0 - p0)
    mov       v6.d[1], v7.d[0]          //pack U|V into full Q registers
    mov       v4.d[1], v5.d[0]
    uabd      v26.16b, v6.16b , v4.16b  //Q13 = ABS(p1 - p0)
    shl       v10.8h, v10.8h, #2        //Q5 = (q0 - p0)<<2
    mov       v0.d[1], v1.d[0]
    uabd      v22.16b, v4.16b , v0.16b  //Q11 = ABS(p0 - q0)
    shl       v8.8h, v8.8h, #2          //Q4 = (q0 - p0)<<2
    mov       v14.d[1], v15.d[0]
    sli       v14.8h, v14.8h, #8        //duplicate each tC0 byte into both halves of a halfword
    mov       v15.d[0], v14.d[1]
    mov       v2.d[1], v3.d[0]
    uabd      v24.16b, v2.16b , v0.16b  //Q12 = ABS(q1 - q0)
    cmhs      v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= alpha ?
    usubl     v20.8h, v6.8b, v2.8b      //Q10 = (p1 - q1)L
    usubl     v6.8h, v7.8b, v3.8b       //Q3 = (p1 - q1)H
    dup       v16.8b, w3                //D16 contains beta_cb
    dup       v17.8b, w5                //D17 contains beta_cr
    mov       v16.d[1], v17.d[0]        //Q8 = (beta_cr | beta_cb)
    add       v8.8h, v8.8h , v20.8h     //
    add       v10.8h, v10.8h , v6.8h    //Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
    cmhs      v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= beta ?
    cmgt      v12.4h, v12.4h, #0        //bs > 0 ?
    sqrshrn   v8.8b, v8.8h, #3          //
    sqrshrn   v9.8b, v10.8h, #3         //Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
    mov       v8.d[1], v9.d[0]
    add       v14.8b, v14.8b , v30.8b   //D14 = C = C0+1 for U
    cmhs      v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= beta ?
    orr       v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    abs       v6.16b, v8.16b            //Q4 = ABS (i_macro)
    add       v15.8b, v15.8b , v30.8b   //D15 = C = C0+1 for V
    mov       v14.d[1], v15.d[0]
    mov       v13.8b, v12.8b
    mov       v12.d[1], v13.d[0]        //replicate bs>0 mask to the V half
    orr       v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    umin      v14.16b, v6.16b , v14.16b //Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    bic       v12.16b, v12.16b , v18.16b //final condition: bs > 0 && thresholds pass
    cmge      v8.16b, v8.16b, #0        //sign(i_macro) mask
    and       v14.16b, v14.16b , v12.16b //Making delta zero in places where values shouldn't be filtered
    uqadd     v16.16b, v4.16b , v14.16b //Q8 = p0 + delta
    uqsub     v4.16b, v4.16b , v14.16b  //Q2 = p0 - delta
    uqadd     v18.16b, v0.16b , v14.16b //Q9 = q0 + delta
    uqsub     v0.16b, v0.16b , v14.16b  //Q0 = q0 - delta
    bif       v16.16b, v4.16b , v8.16b  //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    bif       v0.16b, v18.16b , v8.16b  //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    mov       v17.d[0], v16.d[1]        //split U|V halves back for interleaved store
    mov       v1.d[0], v0.d[1]
    st2       {v16.8b, v17.8b}, [x6], x1 //store filtered p0 row
    st2       {v0.8b, v1.8b}, [x6]      //store filtered q0 row

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0 (interleaved UV)
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @param[in] w6 - u4_bs
//*    Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*    tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*    tc0_table for V
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bslt4_av8

ih264_deblk_chroma_vert_bslt4_av8:

    // STMFD sp!,{x4-x7,x10-x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x1, w1                    //sign-extend stride for pointer arithmetic
    mov       x10, x7                   //x10 = pu1_cliptab_cb
    ldr       x11, [sp, #80]            //x11 = pu1_cliptab_cr (stack arg, past saved regs)
    sub       x0, x0, #4                //point x0 to p1u of row0.
    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
    mov       x12, x0                   //keep a back up of x0 for buffer write
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 //rows 0-3: each lane = one UV pair
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1 //rows 4-7
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    mov       v10.16b, v2.16b           //shuffle so that v0/v2/v4/v6 hold
    mov       v2.16b, v1.16b            //p1/p0/q0/q1 for all 8 rows
    mov       v1.16b, v4.16b            //(v10 is the swap scratch)
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b
    dup       v22.8h, w2                //Q11 = alpha
    mov       v2.d[1], v3.d[0]          //pack rows 0-7 into full Q registers
    mov       v4.d[1], v5.d[0]
    uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    dup       v24.8h, w3                //Q12 = beta
    mov       v25.d[0], v24.d[1]
    mov       v6.d[1], v7.d[0]
    mov       v0.d[1], v1.d[0]
    uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    usubl     v14.8h, v0.8b, v6.8b
    cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    usubl     v16.8h, v1.8b, v7.8b      //(p1 - q1)
    cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    usubl     v18.8h, v4.8b, v2.8b
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    usubl     v20.8h, v5.8b, v3.8b      //(q0 - p0)
    movi      v28.8h, #4                //weight 4 for the mla below
    ld1       {v24.s}[0], [x10]         //Load ClipTable for U
    ld1       {v25.s}[0], [x11]         //Load ClipTable for V
    rev       w6, w6                    //Boundary strengths, byte-reversed to match pixel order
    and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    mov       v10.s[0], w6
    mla       v14.8h, v18.8h , v28.8h
    mla       v16.8h, v20.8h , v28.8h   //4*(q0 - p0) + (p1 - q1)
    uxtl      v10.8h, v10.8b
    sli       v10.4h, v10.4h, #8        //duplicate each bs byte into both halves of a halfword
    tbl       v12.8b, {v24.16b}, v10.8b //tC0 for U
    tbl       v13.8b, {v25.16b}, v10.8b //tC0 for V
    zip1      v31.8b, v12.8b, v13.8b    //interleave tC0 as U,V,U,V... to match pixel layout
    zip2      v13.8b, v12.8b, v13.8b
    mov       v12.8b, v31.8b
    mov       v12.d[1], v13.d[0]
    uxtl      v10.4s, v10.4h
    sli       v10.4s, v10.4s, #16       //spread each bs value across a full 32-bit lane
    movi      v24.16b, #1
    add       v12.16b, v12.16b , v24.16b //tC0 + 1
    cmhs      v10.16b, v10.16b , v24.16b //u4_bs != 0 ?
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
    // Q0 - Q3(inputs),
    // Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    // Q6 (tC)
    srshr     v14.8h, v14.8h, #3
    srshr     v16.8h, v16.8h, #3        //(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    cmgt      v18.8h, v14.8h , #0
    cmgt      v20.8h, v16.8h , #0
    xtn       v18.8b, v18.8h
    xtn       v19.8b, v20.8h            //Q9 = sign(delta)
    mov       v18.d[1], v19.d[0]
    abs       v14.8h, v14.8h
    abs       v16.8h, v16.8h
    xtn       v14.8b, v14.8h
    xtn       v15.8b, v16.8h
    mov       v14.d[1], v15.d[0]
    umin      v14.16b, v14.16b , v12.16b //Q7 = |delta| clipped to tC
    uqadd     v20.16b, v2.16b , v14.16b //p0+|delta|
    uqadd     v22.16b, v4.16b , v14.16b //q0+|delta|
    uqsub     v24.16b, v2.16b , v14.16b //p0-|delta|
    uqsub     v26.16b, v4.16b , v14.16b //q0-|delta|
    bit       v24.16b, v20.16b , v18.16b //p0 + delta
    bit       v22.16b, v26.16b , v18.16b //q0 - delta
    bit       v2.16b, v24.16b , v8.16b  //commit p0 only where the filter condition holds
    bit       v4.16b, v22.16b , v8.16b  //commit q0 only where the filter condition holds
    mov       v1.d[0], v0.d[1]          //unpack rows 4-7 back into D halves
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]
    mov       v10.16b, v1.16b           //inverse shuffle back to row order
    mov       v1.16b, v2.16b            //for the interleaved st4 writes
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b
    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1 //rows 0-3
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1 //rows 4-7
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


