@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class2.s
@*
@* ,:brief
@*  Contains function definitions for sample adaptive offset (SAO) edge offset
@*  class 2 (135 degree diagonal). Functions are coded using NEON intrinsics
@*  and can be compiled using ARM RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  ihevc_sao_edge_offset_class2_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht

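@Reference sketch of the per-pixel operation (an illustrative assumption that
@mirrors the intrinsic pseudo-code in the comments below, not the library's
@exact C routine): class 2 compares each pixel against its top-left and
@bottom-right neighbours (135 degree diagonal).
@
@    for(row = 0; row < ht; row++)
@    {
@        for(col = 0; col < wd; col++)
@        {
@            WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
@            WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
@            WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@            if(0 != edge_idx)
@                pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                                     0, (1 << bit_depth) - 1);
@        }
@        pu1_src += src_strd;
@    }
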
.equ    pu1_src_top_left_offset,    264
.equ    pu1_src_top_right_offset,   268
.equ    pu1_src_bot_left_offset,    272
.equ    pu1_avail_offset,           276
.equ    pi1_sao_offset,             280
.equ    wd_offset,                  284
.equ    ht_offset,                  288

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

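@Note (an added explanatory comment, not part of the original source): each
@literal above holds the offset of gi1_table_edge_idx from the pc value seen
@by the matching "add rX,rX,pc" at ulbl1/ulbl2/ulbl3.  In ARM state pc reads
@as the address of that add plus 8, hence the "- ulblN - 8" bias; the add then
@yields the absolute address of the table in a position-independent way.
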
ihevc_sao_edge_offset_class2_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    vpush       {d8  -  d15}
    SUB         sp,sp,#160                  @Decrement the stack pointer to store some temp arr values

    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r8,[sp,#ht_offset]          @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    STR         r0,[sp,#152]                @Store pu1_src in sp
    MOV         r9,r7                       @Move width to r9 for loop count

    STR         r2,[sp,#156]                @Store pu1_src_left in sp
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDR         r6,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    STR         r3,[sp,#148]                @Store pu1_src_top in sp


    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#2                   @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_4_LOOP:
    LDRB        r10,[r5,#4]                 @pu1_avail[4]
    CMP         r10,#0
    LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    BEQ         PU1_AVAIL_7_LOOP

    LDRB        r11,[r4]                    @pu1_src_top_left[0]
    ADD         r14,r0,r1                   @pu1_src + src_strd

    SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]

    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])

    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r4,#2                    @edge_idx

    LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_7_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP:
    LDRB        r14,[r5,#7]                 @pu1_avail[7]
    CMP         r14,#0
    SUB         r10,r7,#1                   @wd - 1
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ         PU1_AVAIL

    SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]

    SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
    LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])

    SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    MVNLT       r4,#0
    MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])

    ADD         r11,r11,r4                  @Add 2 sign value
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL:
    MOV         r12,r8                      @Move ht
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    CMP         r11,#0

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    CMP         r5,#0
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r11,r11,pc

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    MOV         r6,r7                       @move wd to r6 loop_count
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#144]                @Store pu1_src in sp
    CMP         r7,#16                      @Compare wd with 16

    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3                       @pu1_src_top_cpy
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#wd_offset]          @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#66                   @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#ht_offset]          @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#152]                @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP

    ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    VMOV.I8     Q9,#0
    LDR         r4,[sp,#pu1_avail_offset]   @I Loads pu1_avail

    MOV         r7,r12                      @row count, move ht_tmp to r7
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r4,[r4,#2]                  @I pu1_avail[2]

    LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    CMP         r4,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    SUB         r2,r12,r7                   @I ht_tmp - row
    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r2,#-1]                 @I load the value
    SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r4,#0                       @I
    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)

    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row

    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1

    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])

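@Note (an added explanatory comment, not part of the original source): the loop
@below software-pipelines two rows per pass; the instructions tagged II and III
@in the trailing comments interleave the second and third rows so that loads,
@table look-ups and stores of adjacent rows overlap.  The I-tagged block above
@primes the pipeline with the first row, and the code after the loop drains it.
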
PU1_SRC_LOOP:

    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd

    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd

    LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r4,[r0]                     @II pu1_src_cpy[0]

    LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    SUB         r5,r12,r7                   @II ht_tmp - row
    VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r5,#-1]                 @II load the value
    VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]

    VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r4,#0                       @II
    VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]

    MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r5,[r5,#-1]                 @III load the value

    SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    MVNLT       r2,#0                       @III
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])

    VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    CMP         r7,#1                       @III

    VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])

    BGT         PU1_SRC_LOOP                @III If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @*pu1_src + src_strd

    LDRB        r2,[r0]                     @pu1_src_cpy[0]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]

    SUB         r11,r12,r7                  @ht_tmp - row
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r11,#-1]                @load the value
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]

    VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MVNLT       r4,#0

    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])


INNER_LOOP_DONE:
    ADD         r5,sp,#66                   @*au1_src_left_tmp
    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#156]                @Loads *pu1_src_left

    LDR         r8,[sp,#ht_offset]          @Loads ht
    SUB         r5,r5,#1

    SUB         r2,r2,#1
SRC_LEFT_LOOP:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#144]                @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop


WD_16_HT_4_LOOP:
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#wd_offset]          @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#66                   @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#ht_offset]          @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#152]                @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#66                   @*au1_src_left_tmp
    LDR         r2,[sp,#156]                @Loads *pu1_src_left
    SUB         r5,r5,#1
    SUB         r2,r2,#1

SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop


WIDTH_RESIDUE:
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

PU1_AVAIL_2_RESIDUE:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3

    SUB         r8,r8,#1

    ADD         r5,sp,#66                   @*au1_src_left_tmp
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r4,[sp,#ht_offset]          @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r7,r7,#1                    @(wd - 1)

    LDR         r8,[sp,#152]                @Loads *pu1_src
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))


AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE


    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r5,r14,r5
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#66                   @*au1_src_left_tmp

    LDR         r2,[sp,#156]                @Loads *pu1_src_left
    SUB         r5,r5,#1

    SUB         r2,r2,#1

SRC_LEFT_LOOP_RESIDUE:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r8,[sp,#ht_offset]          @Loads ht
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r0,[sp,#152]                @Loads *pu1_src
    SUB         r8,r8,#1                    @ht - 1

    MLA         r6,r8,r1,r7                 @wd - 1 + (ht - 1) * src_strd
    STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp

    LDR         r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
    ADD         r6,r0,r6                    @pu1_src[wd - 1 + (ht - 1) * src_strd]

    ADD         r12,sp,#2
    STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp

    LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    LDR         r3,[sp,#148]                @Loads pu1_src_top

    STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#160
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
