xref: /aosp_15_r20/external/libdav1d/src/arm/32/filmgrain.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2021, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2021, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm-offsets.h"
31*c0909341SAndroid Build Coastguard Worker
// Dimensions of the grain buffers, in entries. Rows of 8 bit entries are
// laid out GRAIN_WIDTH bytes apart; store_grain_row_44 keeps that same row
// stride for its narrower rows.
// NOTE(review): the SUB_* sizes are presumably those of subsampled chroma
// grain planes; GRAIN_HEIGHT and the SUB_* constants are not referenced in
// this part of the file - confirm against their users.
32*c0909341SAndroid Build Coastguard Worker#define GRAIN_WIDTH 82
33*c0909341SAndroid Build Coastguard Worker#define GRAIN_HEIGHT 73
34*c0909341SAndroid Build Coastguard Worker
35*c0909341SAndroid Build Coastguard Worker#define SUB_GRAIN_WIDTH 44
36*c0909341SAndroid Build Coastguard Worker#define SUB_GRAIN_HEIGHT 38
37*c0909341SAndroid Build Coastguard Worker
// Advance the pseudo-random generator state in r2 by \steps bits at once.
//
// The feedback value is state ^ (state >> 1) ^ (state >> 3) ^ (state >> 12);
// its low \steps bits are the new bits to feed in. All of them are computed
// in one go from the un-shifted state.
// NOTE(review): this presumably relies on a 16 bit state and \steps <= 4
// (bit i of the feedback value reads state bit i + 12) - confirm.
//
// shift=1 (default): r2 is shifted right by \steps and the new bits are
//                    inserted just below bit 16.
// shift=0:           r2 is left in place and the new bits are parked at
//                    bit 16 and up; the consumer then shifts them in one
//                    bit at a time (see read_shift_rand).
//
// In/out: r2 = state. Clobbers r11, r12 and lr.
38*c0909341SAndroid Build Coastguard Worker.macro increment_seed steps, shift=1
39*c0909341SAndroid Build Coastguard Worker        lsr             r11, r2,  #3
40*c0909341SAndroid Build Coastguard Worker        lsr             r12, r2,  #12
41*c0909341SAndroid Build Coastguard Worker        lsr             lr,  r2,  #1
42*c0909341SAndroid Build Coastguard Worker        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
43*c0909341SAndroid Build Coastguard Worker        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
44*c0909341SAndroid Build Coastguard Worker        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
45*c0909341SAndroid Build Coastguard Worker.if \shift
46*c0909341SAndroid Build Coastguard Worker        lsr             r2,  r2,  #\steps
47*c0909341SAndroid Build Coastguard Worker.endif
        // Keep only the \steps new bits.
48*c0909341SAndroid Build Coastguard Worker        and             r11, r11, #((1 << \steps) - 1)    // bit
49*c0909341SAndroid Build Coastguard Worker.if \shift
50*c0909341SAndroid Build Coastguard Worker        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
51*c0909341SAndroid Build Coastguard Worker.else
52*c0909341SAndroid Build Coastguard Worker        orr             r2,  r2,  r11, lsl #16            // *state
53*c0909341SAndroid Build Coastguard Worker.endif
54*c0909341SAndroid Build Coastguard Worker.endm
55*c0909341SAndroid Build Coastguard Worker
// Extract a \bits wide random value from the state in r2 into \dest.
// \age selects which of the values produced by the preceding
// increment_seed (with shift=1) to read: 0 is the most recent one (the
// \bits bits directly below bit 16), and each increment of \age moves the
// field one bit lower, i.e. one generator step further back.
// r2 is left unchanged.
56*c0909341SAndroid Build Coastguard Worker.macro read_rand dest, bits, age
57*c0909341SAndroid Build Coastguard Worker        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
58*c0909341SAndroid Build Coastguard Worker.endm
59*c0909341SAndroid Build Coastguard Worker
// Extract a \bits wide random value into \dest and then advance the state
// in r2 by one bit. The field starts at bit 17 - \bits, one bit higher
// than for read_rand with age 0, since the value is read before r2 is
// shifted right. Meant to follow increment_seed with shift=0, which parks
// the pending new bits at bit 16 and up.
60*c0909341SAndroid Build Coastguard Worker.macro read_shift_rand dest, bits
61*c0909341SAndroid Build Coastguard Worker        ubfx            \dest,  r2,   #17 - \bits, #\bits
62*c0909341SAndroid Build Coastguard Worker        lsr             r2,  r2,  #1
63*c0909341SAndroid Build Coastguard Worker.endm
64*c0909341SAndroid Build Coastguard Worker
65*c0909341SAndroid Build Coastguard Worker// special calling convention:
66*c0909341SAndroid Build Coastguard Worker// r2 holds seed
67*c0909341SAndroid Build Coastguard Worker// r3 holds dav1d_gaussian_sequence
68*c0909341SAndroid Build Coastguard Worker// clobbers r11-r12
69*c0909341SAndroid Build Coastguard Worker// returns in d0-d1
//
// Produces eight entries: the seed is advanced by 4 bits twice, and each of
// the resulting eight 11 bit random values indexes the table of 16 bit
// entries at r3. The loaded entries fill the 16 bit lanes of d0-d1 in the
// order the random values were generated (oldest first).
// r5-r6 are used as scratch and restored; lr is saved since increment_seed
// overwrites it. Address calculations and loads are interleaved.
70*c0909341SAndroid Build Coastguard Workerfunction get_gaussian_neon
71*c0909341SAndroid Build Coastguard Worker        push            {r5-r6,lr}
        // First four random values -> d0[0..3]
72*c0909341SAndroid Build Coastguard Worker        increment_seed  4
73*c0909341SAndroid Build Coastguard Worker        read_rand       r5,  11,  3
74*c0909341SAndroid Build Coastguard Worker        read_rand       r6,  11,  2
        // Table entries are 2 bytes each, hence the lsl #1.
75*c0909341SAndroid Build Coastguard Worker        add             r5,  r3,  r5,  lsl #1
76*c0909341SAndroid Build Coastguard Worker        add             r6,  r3,  r6,  lsl #1
77*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[0]}, [r5]
78*c0909341SAndroid Build Coastguard Worker        read_rand       r5,  11,  1
79*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[1]}, [r6]
80*c0909341SAndroid Build Coastguard Worker        add             r5,  r3,  r5,  lsl #1
81*c0909341SAndroid Build Coastguard Worker        read_rand       r6, 11,  0
        // All four values of the first batch have been read out of r2 by
        // now; advance the seed for the second batch -> d1[0..3].
82*c0909341SAndroid Build Coastguard Worker        increment_seed  4
83*c0909341SAndroid Build Coastguard Worker        add             r6,  r3,  r6,  lsl #1
84*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[2]}, [r5]
85*c0909341SAndroid Build Coastguard Worker        read_rand       r5,  11,  3
86*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[3]}, [r6]
87*c0909341SAndroid Build Coastguard Worker        add             r5,  r3,  r5,  lsl #1
88*c0909341SAndroid Build Coastguard Worker        read_rand       r6,  11,  2
89*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[0]}, [r5]
90*c0909341SAndroid Build Coastguard Worker        add             r6,  r3,  r6,  lsl #1
91*c0909341SAndroid Build Coastguard Worker        read_rand       r5,  11,  1
92*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[1]}, [r6]
93*c0909341SAndroid Build Coastguard Worker        read_rand       r6,  11,  0
94*c0909341SAndroid Build Coastguard Worker        add             r5,  r3,  r5,  lsl #1
95*c0909341SAndroid Build Coastguard Worker        add             r6,  r3,  r6,  lsl #1
96*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[2]}, [r5]
97*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[3]}, [r6]
98*c0909341SAndroid Build Coastguard Worker        pop             {r5-r6,pc}
99*c0909341SAndroid Build Coastguard Workerendfunc
100*c0909341SAndroid Build Coastguard Worker
// Generate one full row of 82 (GRAIN_WIDTH) grain entries.
//
// \r0-\r9 name the d registers that receive 8 entries each, and \r10 the
// d register whose two lowest bytes receive the last 2 entries. Note that
// the macro arguments are named like core registers; only the \-prefixed
// forms refer to the arguments.
//
// Each batch of 16 bit table entries is shifted with vrshl by the per-lane
// amounts in q15 (d30 for the tail) and then narrowed to 8 bit.
// NOTE(review): q15 presumably holds a negative count, making this a
// rounding right shift - confirm where the callers set it up.
//
// Expects r2 = seed and r3 = table, as get_gaussian_neon does. Clobbers q0,
// r11, r12 and lr (bl is used, so the caller must have saved lr).
101*c0909341SAndroid Build Coastguard Worker.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
102*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
103*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
104*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r0, q0
105*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
106*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
107*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r1, q0
108*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
109*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
110*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r2, q0
111*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
112*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
113*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r3, q0
114*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
115*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
116*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r4, q0
117*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
118*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
119*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r5, q0
120*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
121*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
122*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r6, q0
123*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
124*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
125*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r7, q0
126*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
127*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
128*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r8, q0
129*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
130*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
131*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r9, q0
        // Tail: the last two entries of the row, done inline.
132*c0909341SAndroid Build Coastguard Worker        increment_seed  2
133*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  1
134*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  0
135*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
136*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
137*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[0]}, [r11]
138*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[1]}, [r12]
139*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d0,  d0,  d30
        // Only bytes 0-1 of \r10 are fresh; the other lanes of q0 are
        // leftovers from the previous batch.
140*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r10, q0
141*c0909341SAndroid Build Coastguard Worker.endm
142*c0909341SAndroid Build Coastguard Worker
// Store one row held in the given d registers (as laid out by
// get_grain_row) to [r0], advancing r0 past it: 32 + 32 + 16 bytes from
// \r0-\r9, then the 2 bytes in the lowest 16 bit lane of \r10, i.e. 82
// (GRAIN_WIDTH) bytes in total.
// As in get_grain_row, \rN are macro arguments while the plain r0 is the
// core register holding the destination pointer.
143*c0909341SAndroid Build Coastguard Worker.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
144*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
145*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r4, \r5, \r6, \r7}, [r0]!
146*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r8, \r9},           [r0]!
147*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r10[0]},            [r0]!
148*c0909341SAndroid Build Coastguard Worker.endm
149*c0909341SAndroid Build Coastguard Worker
// Generate one row of 44 grain entries (matching SUB_GRAIN_WIDTH): five
// batches of 8 entries into the d registers \r0-\r4, plus 4 more entries in
// the low four bytes of \r5.
//
// Same conventions as get_grain_row: r2 = seed, r3 = table, q15/d30 hold
// the per-lane shift applied with vrshl before narrowing to 8 bit.
// Clobbers q0, r11, r12 and lr (bl is used, so the caller must have saved
// lr).
150*c0909341SAndroid Build Coastguard Worker.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
151*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
152*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
153*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r0, q0
154*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
155*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
156*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r1, q0
157*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
158*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
159*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r2, q0
160*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
161*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
162*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r3, q0
163*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
164*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q0,  q0,  q15
165*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r4, q0
        // Tail: the last four entries of the row, done inline.
166*c0909341SAndroid Build Coastguard Worker        increment_seed  4
167*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  3
168*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  2
169*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
170*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
        // Loads to all lanes of d0; lanes 1-3 are overwritten below.
171*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[]}, [r11]
172*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  1
173*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[1]}, [r12]
174*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
175*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  0
176*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[2]}, [r11]
177*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
178*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[3]}, [r12]
179*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d0,  d0,  d30
        // The high four bytes of \r5 come from d1, which still holds
        // leftovers from the previous batch.
180*c0909341SAndroid Build Coastguard Worker        vmovn.i16       \r5, q0
181*c0909341SAndroid Build Coastguard Worker.endm
182*c0909341SAndroid Build Coastguard Worker
// Store one row held in the given d registers (as laid out by
// get_grain_row_44) to [r0]: 32 bytes with writeback followed by 16 bytes
// without, i.e. 48 bytes are written, of which the first 44 are the
// generated entries. r0 is then advanced by the remainder of the row
// stride, so it ends up GRAIN_WIDTH bytes past where it started.
183*c0909341SAndroid Build Coastguard Worker.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
184*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
185*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r4, \r5},           [r0]
186*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  #GRAIN_WIDTH-32
187*c0909341SAndroid Build Coastguard Worker.endm
188*c0909341SAndroid Build Coastguard Worker
// Produce two grain entries in the two lowest bytes of d0.
//
// Uses r2 = seed and r3 = table like get_gaussian_neon, and d30 as the
// per-lane shift applied with vrshl before narrowing to 8 bit.
// r11 is saved and restored, and lr is saved since increment_seed
// overwrites it; r12 is clobbered. The remaining bytes of d0 are derived
// from whatever q0 held on entry and carry no new data.
189*c0909341SAndroid Build Coastguard Workerfunction get_grain_2_neon
190*c0909341SAndroid Build Coastguard Worker        push            {r11,lr}
191*c0909341SAndroid Build Coastguard Worker        increment_seed  2
192*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  1
193*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  0
194*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
195*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
196*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[0]}, [r11]
197*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[1]}, [r12]
198*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d0,  d0,  d30
199*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d0,  q0
200*c0909341SAndroid Build Coastguard Worker        pop             {r11,pc}
201*c0909341SAndroid Build Coastguard Workerendfunc
202*c0909341SAndroid Build Coastguard Worker
// Call get_grain_2_neon and copy its result from d0 to \dst; the copy is
// skipped when \dst is written as d0. d0 is overwritten either way, and lr
// is clobbered by the bl.
203*c0909341SAndroid Build Coastguard Worker.macro get_grain_2 dst
204*c0909341SAndroid Build Coastguard Worker        bl              get_grain_2_neon
205*c0909341SAndroid Build Coastguard Worker.ifnc \dst, d0
206*c0909341SAndroid Build Coastguard Worker        vmov            \dst, d0
207*c0909341SAndroid Build Coastguard Worker.endif
208*c0909341SAndroid Build Coastguard Worker.endm
209*c0909341SAndroid Build Coastguard Worker
210*c0909341SAndroid Build Coastguard Worker// r1 holds the number of entries to produce
211*c0909341SAndroid Build Coastguard Worker// r6, r8 and r10 hold the previous output entries
212*c0909341SAndroid Build Coastguard Worker// q0 holds the vector of produced entries
213*c0909341SAndroid Build Coastguard Worker// q1 holds the input vector of sums from above
//
// Defines the function output_lag\n\()_neon, for \n = 1, 2 or 3. For each
// entry to produce, it
//   - takes the next 32 bit sum from the lowest lane of q1,
//   - adds the products of the previous outputs and their coefficients,
//   - adds a rounding constant and shifts right by r7,
//   - adds a random table entry (table at r3, seed in r2, consumed with
//     read_shift_rand), itself rounded and shifted right by r9,
//   - clamps the result to [-128, r5], and
//   - inserts it as the topmost byte of q0 after moving the existing bytes
//     down by one; q1 is rotated by one 32 bit lane.
//
// Register use differs by lag (only lag 3 uses all of r6, r8 and r10 for
// previous outputs):
//   lag 1: r4 = coefficient for the previous output in r6;
//          r8 and r10 = the constants added before the shifts by r7 and r9
//   lag 2: r4 and r10 = coefficients for the previous outputs in r8 and r6
//          (r6 being the most recent one)
//   lag 3: r4 = three signed 8 bit coefficients packed in bits 0-7, 8-15
//          and 16-23, for the previous outputs in r10, r8 and r6
//          respectively (r6 being the most recent one)
// For lag 2 and 3, the rounding constants are computed on entry, in r0 and
// lr. r0 is preserved over the call; r11 and r12 are clobbered.
214*c0909341SAndroid Build Coastguard Worker.macro output_lag n
215*c0909341SAndroid Build Coastguard Workerfunction output_lag\n\()_neon
216*c0909341SAndroid Build Coastguard Worker        push            {r0, lr}
217*c0909341SAndroid Build Coastguard Worker.if \n == 1
        // lr can hold the lower clamp bound for the whole loop.
218*c0909341SAndroid Build Coastguard Worker        mov             lr,  #-128
219*c0909341SAndroid Build Coastguard Worker.else
        // r0 = 1 << (r7 - 1), lr = 1 << (r9 - 1); r7 and r9 are
        // decremented temporarily and restored right after.
220*c0909341SAndroid Build Coastguard Worker        mov             r0,  #1
221*c0909341SAndroid Build Coastguard Worker        mov             lr,  #1
222*c0909341SAndroid Build Coastguard Worker        sub             r7,  r7,  #1
223*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  #1
224*c0909341SAndroid Build Coastguard Worker        lsl             r0,  r0,  r7
225*c0909341SAndroid Build Coastguard Worker        lsl             lr,  lr,  r9
226*c0909341SAndroid Build Coastguard Worker        add             r7,  r7,  #1
227*c0909341SAndroid Build Coastguard Worker        add             r9,  r9,  #1
228*c0909341SAndroid Build Coastguard Worker.endif
229*c0909341SAndroid Build Coastguard Worker1:
        // r12 = random index -> signed 16 bit table entry;
        // r11 = the next sum from q1; q0 makes room for the new byte.
230*c0909341SAndroid Build Coastguard Worker        read_shift_rand r12, 11
231*c0909341SAndroid Build Coastguard Worker        vmov.32         r11, d2[0]
232*c0909341SAndroid Build Coastguard Worker        lsl             r12, r12, #1
233*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q0,  q0,  #1
234*c0909341SAndroid Build Coastguard Worker        ldrsh           r12, [r3, r12]
235*c0909341SAndroid Build Coastguard Worker.if \n == 1
236*c0909341SAndroid Build Coastguard Worker        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
237*c0909341SAndroid Build Coastguard Worker        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
238*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r10
239*c0909341SAndroid Build Coastguard Worker        asr             r6,  r6,  r7              // >> ar_coeff_shift
240*c0909341SAndroid Build Coastguard Worker        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
241*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r12
242*c0909341SAndroid Build Coastguard Worker        cmp             r6,  r5
243*c0909341SAndroid Build Coastguard Worker.elseif \n == 2
244*c0909341SAndroid Build Coastguard Worker        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
245*c0909341SAndroid Build Coastguard Worker        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
        // The most recent output becomes the older one.
246*c0909341SAndroid Build Coastguard Worker        mov             r8,  r6
247*c0909341SAndroid Build Coastguard Worker        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
248*c0909341SAndroid Build Coastguard Worker        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
249*c0909341SAndroid Build Coastguard Worker        asr             r6,  r6,  r7              // >> ar_coeff_shift
250*c0909341SAndroid Build Coastguard Worker        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
251*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r12
        // lr holds the rounding constant; stash it and borrow lr for the
        // lower clamp bound. The mov leaves the flags from cmp intact.
252*c0909341SAndroid Build Coastguard Worker        push            {lr}
253*c0909341SAndroid Build Coastguard Worker        cmp             r6,  r5
254*c0909341SAndroid Build Coastguard Worker        mov             lr,  #-128
255*c0909341SAndroid Build Coastguard Worker.else
        // r1-r3 (count, seed, table) are borrowed for the three unpacked
        // coefficients.
256*c0909341SAndroid Build Coastguard Worker        push            {r1-r3}
257*c0909341SAndroid Build Coastguard Worker        sbfx            r1,  r4,  #0,  #8
258*c0909341SAndroid Build Coastguard Worker        sbfx            r2,  r4,  #8,  #8
259*c0909341SAndroid Build Coastguard Worker        sbfx            r3,  r4,  #16, #8
260*c0909341SAndroid Build Coastguard Worker        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
261*c0909341SAndroid Build Coastguard Worker        mla             r11, r8,  r2,  r11        // += *coeff * prev output 2
262*c0909341SAndroid Build Coastguard Worker        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
263*c0909341SAndroid Build Coastguard Worker        pop             {r1-r3}
        // Age the history of previous outputs by one step.
264*c0909341SAndroid Build Coastguard Worker        mov             r10, r8
265*c0909341SAndroid Build Coastguard Worker        mov             r8,  r6
266*c0909341SAndroid Build Coastguard Worker
267*c0909341SAndroid Build Coastguard Worker        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
268*c0909341SAndroid Build Coastguard Worker        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
269*c0909341SAndroid Build Coastguard Worker        asr             r6,  r6,  r7              // >> ar_coeff_shift
270*c0909341SAndroid Build Coastguard Worker        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
271*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r12
        // lr holds the rounding constant; stash it and borrow lr for the
        // lower clamp bound. The mov leaves the flags from cmp intact.
272*c0909341SAndroid Build Coastguard Worker        push            {lr}
273*c0909341SAndroid Build Coastguard Worker        cmp             r6,  r5
274*c0909341SAndroid Build Coastguard Worker        mov             lr,  #-128
275*c0909341SAndroid Build Coastguard Worker.endif
        // Clamp r6 to [lr, r5] = [-128, r5].
276*c0909341SAndroid Build Coastguard Worker        it              gt
277*c0909341SAndroid Build Coastguard Worker        movgt           r6,  r5
278*c0909341SAndroid Build Coastguard Worker        cmp             r6,  lr
279*c0909341SAndroid Build Coastguard Worker        it              lt
280*c0909341SAndroid Build Coastguard Worker        movlt           r6,  lr
281*c0909341SAndroid Build Coastguard Worker.if \n >= 2
282*c0909341SAndroid Build Coastguard Worker        pop             {lr}
283*c0909341SAndroid Build Coastguard Worker.endif
        // The flags set by subs are consumed by the bgt below; the NEON
        // instructions in between do not touch them.
284*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
        // Bring the next sum into the lowest lane of q1, and store the new
        // output as the topmost byte of q0.
285*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q1,  q1,  #4
286*c0909341SAndroid Build Coastguard Worker        vmov.8          d1[7], r6
287*c0909341SAndroid Build Coastguard Worker        bgt             1b
288*c0909341SAndroid Build Coastguard Worker        pop             {r0, pc}
289*c0909341SAndroid Build Coastguard Workerendfunc
290*c0909341SAndroid Build Coastguard Worker.endm
291*c0909341SAndroid Build Coastguard Worker
// Instantiate output_lag1_neon, output_lag2_neon and output_lag3_neon.
292*c0909341SAndroid Build Coastguard Workeroutput_lag 1
293*c0909341SAndroid Build Coastguard Workeroutput_lag 2
294*c0909341SAndroid Build Coastguard Workeroutput_lag 3
295*c0909341SAndroid Build Coastguard Worker
296*c0909341SAndroid Build Coastguard Worker
// Weighted sum of three vectors of 16 signed 8 bit entries each:
//   sum = q3 * d28 + q0 * d27 + q1 * d29
// where each d register of an input is multiplied lane-wise by the given
// coefficient register.
// NOTE(review): d27-d29 presumably each hold one coefficient replicated
// across all lanes, and q3/q0/q1 three horizontally offset views of the row
// above - confirm against the callers.
//
// Input:    q0, q1, q3 (entries), d27-d29 (coefficients)
// Output:   q2, q3, q4, q5 = the 16 sums as 32 bit values, in that order
// Clobbers: q0, q1
//
// The products are formed as 16 bit values and widened to 32 bit when
// added. Registers are recycled as soon as their contents have been
// consumed (q3 is an input, a temporary and an output), so the instruction
// order here is significant.
// NOTE(review): q4-q5 overlap d8-d11, which are callee-saved in the
// standard ARM procedure call standard; preserving them is presumably left
// to the exported entry points - confirm.
297*c0909341SAndroid Build Coastguard Workerfunction sum_lag1_above_neon
        // 16 bit products for the q3 and q0 inputs.
298*c0909341SAndroid Build Coastguard Worker        vmull.s8        q2,  d6,  d28
299*c0909341SAndroid Build Coastguard Worker        vmull.s8        q3,  d7,  d28
300*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d0,  d27
301*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d1,  d27
302*c0909341SAndroid Build Coastguard Worker
        // Add them up as 32 bit; entries 0-3, 4-7, 8-11 and 12-15 end up
        // in q0, q2, q4 and q5.
303*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d4,  d8
304*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q2,  d5,  d9
305*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q4,  d6,  d10
306*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q5,  d7,  d11
307*c0909341SAndroid Build Coastguard Worker
        // 16 bit products for the q1 input; d3 is consumed first since
        // the second multiplication overwrites q1 (d2-d3).
308*c0909341SAndroid Build Coastguard Worker        vmull.s8        q3,  d3,  d29
309*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d2,  d29
310*c0909341SAndroid Build Coastguard Worker
        // Accumulate those; q3 (d6-d7) is read before it is overwritten
        // with the final sums for entries 4-7.
311*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q4,  q4,  d6
312*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q5,  q5,  d7
313*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q3,  q2,  d3
314*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q2,  q0,  d2
315*c0909341SAndroid Build Coastguard Worker        bx              lr
316*c0909341SAndroid Build Coastguard Workerendfunc
317*c0909341SAndroid Build Coastguard Worker
318*c0909341SAndroid Build Coastguard Worker.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
319*c0909341SAndroid Build Coastguard Worker.ifc \lag\()_\edge, lag3_left
320*c0909341SAndroid Build Coastguard Worker        bl              sum_lag3_left_above_neon
321*c0909341SAndroid Build Coastguard Worker.else
322*c0909341SAndroid Build Coastguard Worker        bl              sum_\lag\()_above_neon
323*c0909341SAndroid Build Coastguard Worker.endif
324*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
325*c0909341SAndroid Build Coastguard Worker        vpush           {q6-q7}
326*c0909341SAndroid Build Coastguard Worker        add             r12, r11, #GRAIN_WIDTH
327*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r11]!
328*c0909341SAndroid Build Coastguard Worker        vld1.16         {q6, q7}, [r12]!
329*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q0,  q0
330*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q1,  q1
331*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q6,  q6
332*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q7,  q7
333*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q6
334*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q7
335*c0909341SAndroid Build Coastguard Worker        vpop            {q6-q7}
336*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d0,  q0,  #2
337*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d1,  q1,  #2
338*c0909341SAndroid Build Coastguard Worker.endif
339*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_422
340*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0, q1}, [r11]!
341*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q0,  q0
342*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q1,  q1
343*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d0,  q0,  #1
344*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d1,  q1,  #1
345*c0909341SAndroid Build Coastguard Worker.endif
346*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
347*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0}, [r11]!
348*c0909341SAndroid Build Coastguard Worker.endif
349*c0909341SAndroid Build Coastguard Worker.if \uv_layout
350*c0909341SAndroid Build Coastguard Worker.ifnb \uv_coeff
351*c0909341SAndroid Build Coastguard Worker        vdup.8          d13, \uv_coeff
352*c0909341SAndroid Build Coastguard Worker.endif
353*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d0,  d13
354*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d1,  d13
355*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q2,  q2,  d2
356*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q3,  q3,  d3
357*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q4,  q4,  d0
358*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q5,  q5,  d1
359*c0909341SAndroid Build Coastguard Worker.endif
360*c0909341SAndroid Build Coastguard Worker.if \uv_layout && \elems == 16
361*c0909341SAndroid Build Coastguard Worker        b               sum_\lag\()_y_\edge\()_start
362*c0909341SAndroid Build Coastguard Worker.elseif \uv_layout == 444 && \elems == 15
363*c0909341SAndroid Build Coastguard Worker        b               sum_\lag\()_y_\edge\()_start
364*c0909341SAndroid Build Coastguard Worker.elseif \uv_layout == 422 && \elems == 9
365*c0909341SAndroid Build Coastguard Worker        b               sum_\lag\()_uv_420_\edge\()_start
366*c0909341SAndroid Build Coastguard Worker.else
367*c0909341SAndroid Build Coastguard Workersum_\lag\()_\type\()_\edge\()_start:
368*c0909341SAndroid Build Coastguard Worker        push            {r11}
369*c0909341SAndroid Build Coastguard Worker.ifc \edge, left
370*c0909341SAndroid Build Coastguard Worker        increment_seed  4
371*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  3
372*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  2
373*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
374*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
375*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[1]}, [r11]
376*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  1
377*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[2]}, [r12]
378*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
379*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1[3]}, [r11]
380*c0909341SAndroid Build Coastguard Worker        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
381*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d1,  d1,  d30
382*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d1,  q0
383*c0909341SAndroid Build Coastguard Worker        vext.8          q2,  q2,  q2,  #12
384*c0909341SAndroid Build Coastguard Worker.ifc \lag, lag3
385*c0909341SAndroid Build Coastguard Worker        vmov.s8         r10, d1[5]
386*c0909341SAndroid Build Coastguard Worker.endif
387*c0909341SAndroid Build Coastguard Worker.ifnc \lag, lag1
388*c0909341SAndroid Build Coastguard Worker        vmov.s8         r8,  d1[6]
389*c0909341SAndroid Build Coastguard Worker.endif
390*c0909341SAndroid Build Coastguard Worker        vmov.s8         r6,  d1[7]
391*c0909341SAndroid Build Coastguard Worker
392*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q2
393*c0909341SAndroid Build Coastguard Worker        mov             r1,  #1
394*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
395*c0909341SAndroid Build Coastguard Worker.else
396*c0909341SAndroid Build Coastguard Worker        increment_seed  4, shift=0
397*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q2
398*c0909341SAndroid Build Coastguard Worker        mov             r1,  #4
399*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
400*c0909341SAndroid Build Coastguard Worker.endif
401*c0909341SAndroid Build Coastguard Worker
402*c0909341SAndroid Build Coastguard Worker        increment_seed  4, shift=0
403*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q3
404*c0909341SAndroid Build Coastguard Worker        mov             r1,  #4
405*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
406*c0909341SAndroid Build Coastguard Worker
407*c0909341SAndroid Build Coastguard Worker        increment_seed  4, shift=0
408*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q4
409*c0909341SAndroid Build Coastguard Worker.if \elems == 9
410*c0909341SAndroid Build Coastguard Worker        mov             r1,  #1
411*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
412*c0909341SAndroid Build Coastguard Worker        lsr             r2,  r2,  #3
413*c0909341SAndroid Build Coastguard Worker
414*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  2
415*c0909341SAndroid Build Coastguard Worker        read_rand       r12, 11,  1
416*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
417*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r12, lsl #1
418*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2[0]}, [r11]
419*c0909341SAndroid Build Coastguard Worker        read_rand       r11, 11,  0
420*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2[1]}, [r12]
421*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
422*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2[2]}, [r11]
423*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d2,  d2,  d30
424*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d2,  q1
425*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q0,  q1,  #7
426*c0909341SAndroid Build Coastguard Worker.else
427*c0909341SAndroid Build Coastguard Worker        mov             r1,  #4
428*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
429*c0909341SAndroid Build Coastguard Worker
430*c0909341SAndroid Build Coastguard Worker        increment_seed  4, shift=0
431*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q5
432*c0909341SAndroid Build Coastguard Worker
433*c0909341SAndroid Build Coastguard Worker.ifc \edge, right
434*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
435*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
436*c0909341SAndroid Build Coastguard Worker        read_shift_rand r11, 11
437*c0909341SAndroid Build Coastguard Worker        add             r11, r3,  r11, lsl #1
438*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2[0]}, [r11]
439*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d2,  d2,  d30
440*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q0,  q1,  #1
441*c0909341SAndroid Build Coastguard Worker.else
442*c0909341SAndroid Build Coastguard Worker        mov             r1,  #4
443*c0909341SAndroid Build Coastguard Worker        bl              output_\lag\()_neon
444*c0909341SAndroid Build Coastguard Worker.endif
445*c0909341SAndroid Build Coastguard Worker.endif
446*c0909341SAndroid Build Coastguard Worker.if \store
447*c0909341SAndroid Build Coastguard Worker        vst1.8          {q0}, [r0]!
448*c0909341SAndroid Build Coastguard Worker.endif
449*c0909341SAndroid Build Coastguard Worker        pop             {r11}
450*c0909341SAndroid Build Coastguard Worker        pop             {r1, pc}
451*c0909341SAndroid Build Coastguard Worker.endif
452*c0909341SAndroid Build Coastguard Worker.endm
453*c0909341SAndroid Build Coastguard Worker
// sum_lag1_func: emits the function sum_<type>_lag1_<edge>_neon.
//   type      - name fragment for the variant (y, uv_444, uv_422 or uv_420
//               in the instantiations that follow)
//   uv_layout - forwarded unchanged to sum_lag_n_body
//   edge      - left, mid or right; part of the function name and forwarded
//   elems     - forwarded to sum_lag_n_body; defaults to 16
// The emitted function saves r1 and lr and then expands sum_lag_n_body for
// lag1 with store=0. No return instruction is written here.
// NOTE(review): the return is expected to come from the sum_lag_n_body
// expansion (a pop into r1 and pc) - confirm it balances this push on every
// path through that macro.
454*c0909341SAndroid Build Coastguard Worker.macro sum_lag1_func type, uv_layout, edge, elems=16
455*c0909341SAndroid Build Coastguard Workerfunction sum_\type\()_lag1_\edge\()_neon
456*c0909341SAndroid Build Coastguard Worker        push            {r1, lr}
457*c0909341SAndroid Build Coastguard Worker        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
458*c0909341SAndroid Build Coastguard Workerendfunc
459*c0909341SAndroid Build Coastguard Worker.endm
460*c0909341SAndroid Build Coastguard Worker
// Instantiate the lag1 functions for every plane type (y, uv_444, uv_422,
// uv_420) and every edge position (left, mid, right). The second argument is
// the uv_layout value: 0 for y, otherwise 444, 422 or 420. Only the right
// edge overrides elems (15 for y and uv_444, 9 for uv_422 and uv_420); all
// other variants use the macro default of 16.
// NOTE(review): 15 and 9 are presumably the number of valid elements in the
// last block of a row for the respective layouts - confirm in sum_lag_n_body.
461*c0909341SAndroid Build Coastguard Workersum_lag1_func y,      0,   left
462*c0909341SAndroid Build Coastguard Workersum_lag1_func y,      0,   mid
463*c0909341SAndroid Build Coastguard Workersum_lag1_func y,      0,   right, 15
464*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_444, 444, left
465*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_444, 444, mid
466*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_444, 444, right, 15
467*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_422, 422, left
468*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_422, 422, mid
469*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_422, 422, right, 9
470*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_420, 420, left
471*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_420, 420, mid
472*c0909341SAndroid Build Coastguard Workersum_lag1_func uv_420, 420, right, 9
473*c0909341SAndroid Build Coastguard Worker
// sum_lag1: set up the inputs for, and call, sum_<type>_lag1_<edge>_neon.
//   type             - selects the function variant (y, uv_444, ...)
//   dst              - q register that receives the result (copied from q0)
//   left, mid, right - q registers holding three adjacent 16-byte vectors
//                      (presumably from the row above - confirm at call sites)
//   edge             - selects the edge variant; defaults to mid
// Before the call: q3 = mid, q0 = mid shifted by one lane so each lane holds
// its left neighbour, q1 = mid shifted so each lane holds its right neighbour.
// Because q3 and q0 are written before all arguments have been read, left
// must not be q3, mid must not be q0, and right must be neither q0 nor q3.
// Overwrites q0, q1, q3 and lr (via bl), plus whatever the callee clobbers.
474*c0909341SAndroid Build Coastguard Worker.macro sum_lag1 type, dst, left, mid, right, edge=mid
475*c0909341SAndroid Build Coastguard Worker        vmov            q3,  \mid
476*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  \left, \mid,   #15 // q0 = {left[15], mid[0..14]}
477*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  \mid,  \right, #1 // q1 = {mid[1..15], right[0]}
478*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag1_\edge\()_neon
479*c0909341SAndroid Build Coastguard Worker        vmov            \dst, q0 // the callee leaves its result in q0
480*c0909341SAndroid Build Coastguard Worker.endm
481*c0909341SAndroid Build Coastguard Worker
// sum_y_lag1: sum_lag1 with type fixed to y; all other arguments are passed
// through unchanged. edge defaults to mid.
482*c0909341SAndroid Build Coastguard Worker.macro sum_y_lag1 dst, left, mid, right, edge=mid
483*c0909341SAndroid Build Coastguard Worker        sum_lag1        y, \dst, \left, \mid, \right, \edge
484*c0909341SAndroid Build Coastguard Worker.endm
485*c0909341SAndroid Build Coastguard Worker
// sum_uv_444_lag1: sum_lag1 with type fixed to uv_444; all other arguments
// are passed through unchanged. edge defaults to mid.
486*c0909341SAndroid Build Coastguard Worker.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
487*c0909341SAndroid Build Coastguard Worker        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
488*c0909341SAndroid Build Coastguard Worker.endm
489*c0909341SAndroid Build Coastguard Worker
// sum_uv_422_lag1: sum_lag1 with type fixed to uv_422; all other arguments
// are passed through unchanged. edge defaults to mid.
490*c0909341SAndroid Build Coastguard Worker.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
491*c0909341SAndroid Build Coastguard Worker        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
492*c0909341SAndroid Build Coastguard Worker.endm
493*c0909341SAndroid Build Coastguard Worker
// sum_uv_420_lag1: sum_lag1 with type fixed to uv_420; all other arguments
// are passed through unchanged. edge defaults to mid.
494*c0909341SAndroid Build Coastguard Worker.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
495*c0909341SAndroid Build Coastguard Worker        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
496*c0909341SAndroid Build Coastguard Worker.endm
497*c0909341SAndroid Build Coastguard Worker
498*c0909341SAndroid Build Coastguard Worker
// sum_lag2_above_neon: accumulate the ten taps taken from the two rows above
// the current position (offsets -2..+2 in each row) for 16 elements at once.
// In:
//   r0                   - base address; the rows above are addressed
//                          relative to it in units of GRAIN_WIDTH (defined
//                          elsewhere; presumably the row stride in bytes)
//   q8,  q9              - left and middle 16 bytes of the row two above
//   q11, q12             - left and middle 16 bytes of the row one above
//   d28[0..4]            - signed 8-bit coefficients for offsets -2..+2,
//                          row two above
//   d28[5..7], d29[0..1] - the same for the row one above
// Out:
//   q2-q5                - 32-bit sums for elements 0-3, 4-7, 8-11, 12-15
//   q8/q9, q11/q12       - advanced by one block: old middle becomes left,
//                          the freshly loaded right block becomes middle
// Clobbers q0, q1, q6, q7, q10, q13 and r12. lr is used as a second scratch
// pointer, so it is saved on entry and the function returns by popping the
// saved value into pc.
// Each step multiplies two shifted copies of the input by one coefficient
// each (vmull.s8 gives 16-bit products), adds the two products while widening
// to 32 bits (vaddl.s16) and accumulates into q2-q5.
499*c0909341SAndroid Build Coastguard Workerfunction sum_lag2_above_neon
500*c0909341SAndroid Build Coastguard Worker        push            {lr}
501*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #2*GRAIN_WIDTH - 16 // r12 = r0 - 2*GRAIN_WIDTH + 16
502*c0909341SAndroid Build Coastguard Worker        sub             lr,  r0,  #1*GRAIN_WIDTH - 16 // lr  = r0 - 1*GRAIN_WIDTH + 16
503*c0909341SAndroid Build Coastguard Worker        vld1.8          {q10}, [r12] // load top right
504*c0909341SAndroid Build Coastguard Worker        vld1.8          {q13}, [lr] // right block of the row one above
505*c0909341SAndroid Build Coastguard Worker
506*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q8,  q9,  #14 // top left, top mid
507*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d28[0] // coefficient for offset -2, two rows up
508*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q8,  q9,  #15 // offset -1; q8 is overwritten and rebuilt at the end
509*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d28[1] // coefficient for offset -1, two rows up
510*c0909341SAndroid Build Coastguard Worker
511*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d14 // offset -2, elements 0-7
512*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d14 // offset -2, elements 8-15
513*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d16, d15 // offset -1, elements 0-7
514*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d17, d15 // offset -1, elements 8-15
515*c0909341SAndroid Build Coastguard Worker
516*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q2,  d0,  d12 // q2-q5 start out as the sum of the first two taps
517*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q3,  d1,  d13
518*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q4,  d2,  d16
519*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q5,  d3,  d17
520*c0909341SAndroid Build Coastguard Worker
521*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q9,  q10, #1  // top mid, top right
522*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d28[3] // coefficient for offset +1, two rows up
523*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q9,  q10, #2 // offset +2
524*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d28[4] // coefficient for offset +2, two rows up
525*c0909341SAndroid Build Coastguard Worker
526*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d14
527*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d14
528*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d16, d15
529*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d17, d15
530*c0909341SAndroid Build Coastguard Worker
531*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d0,  d12 // q7 (d14/d15) is free again: the coefficients have been consumed
532*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d1,  d13
533*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d2,  d16
534*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d17
535*c0909341SAndroid Build Coastguard Worker
536*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q7
537*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q0
538*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q6
539*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
540*c0909341SAndroid Build Coastguard Worker
541*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #14 // top left, top mid
542*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d28[5] // coefficient for offset -2, one row up
543*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q11, q12, #15 // offset -1
544*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d28[6] // coefficient for offset -1, one row up
545*c0909341SAndroid Build Coastguard Worker
546*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d14
547*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d14
548*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d16, d15
549*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d17, d15
550*c0909341SAndroid Build Coastguard Worker
551*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d0,  d12
552*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d1,  d13
553*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d2,  d16
554*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d17
555*c0909341SAndroid Build Coastguard Worker
556*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q7
557*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q0
558*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q6
559*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
560*c0909341SAndroid Build Coastguard Worker
561*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q12, q13, #1  // top mid, top right
562*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d29[0] // coefficient for offset +1, one row up
563*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q12, q13, #2 // offset +2
564*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d29[1] // coefficient for offset +2, one row up
565*c0909341SAndroid Build Coastguard Worker
566*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d14
567*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d14
568*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d16, d15
569*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d17, d15
570*c0909341SAndroid Build Coastguard Worker
571*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d0,  d12
572*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d1,  d13
573*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d2,  d16
574*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d17
575*c0909341SAndroid Build Coastguard Worker
576*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q7
577*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q0
578*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q6
579*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
580*c0909341SAndroid Build Coastguard Worker
581*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d28[2] // coefficient for offset 0, two rows up
582*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d28[7] // coefficient for offset 0, one row up
583*c0909341SAndroid Build Coastguard Worker
584*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d18, d14 // unshifted middle block (q9), two rows up
585*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d19, d14
586*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d24, d15 // unshifted middle block (q12), one row up
587*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d25, d15
588*c0909341SAndroid Build Coastguard Worker
589*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d0,  d12
590*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d1,  d13
591*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d2,  d16
592*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d17
593*c0909341SAndroid Build Coastguard Worker
594*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9 // slide the two-rows-up window: mid -> left
595*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q10 // right -> mid
596*c0909341SAndroid Build Coastguard Worker
597*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q7
598*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q0
599*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q6
600*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
601*c0909341SAndroid Build Coastguard Worker
602*c0909341SAndroid Build Coastguard Worker        vmov            q11, q12 // slide the one-row-up window: mid -> left
603*c0909341SAndroid Build Coastguard Worker        vmov            q12, q13 // right -> mid
604*c0909341SAndroid Build Coastguard Worker
605*c0909341SAndroid Build Coastguard Worker        pop             {pc}
606*c0909341SAndroid Build Coastguard Workerendfunc
607*c0909341SAndroid Build Coastguard Worker
// sum_lag2_func: emits the function sum_<type>_lag2_<edge>_neon.
//   type, uv_layout, edge, elems - forwarded to sum_lag_n_body; type and edge
//                                  also form the function name. elems
//                                  defaults to 16.
// The emitted function saves r1 and lr. For the left edge only, it loads q9
// and q12 with the 16 bytes directly above the current position (two rows
// and one row up); lr is free to use as a scratch pointer because it has
// already been pushed. It then expands sum_lag_n_body for lag2 with store=1
// and uv_coeff=d29[4].
// NOTE(review): for the other edges q9/q12 are presumably carried over from
// the previous call, since sum_lag2_above_neon advances them on exit -
// confirm against the callers.
608*c0909341SAndroid Build Coastguard Worker.macro sum_lag2_func type, uv_layout, edge, elems=16
609*c0909341SAndroid Build Coastguard Workerfunction sum_\type\()_lag2_\edge\()_neon
610*c0909341SAndroid Build Coastguard Worker        push            {r1, lr}
611*c0909341SAndroid Build Coastguard Worker.ifc \edge, left
612*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #2*GRAIN_WIDTH
613*c0909341SAndroid Build Coastguard Worker        sub             lr,  r0,  #1*GRAIN_WIDTH
614*c0909341SAndroid Build Coastguard Worker        vld1.8          {q9},  [r12] // load the previous block right above
615*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12}, [lr]
616*c0909341SAndroid Build Coastguard Worker.endif
617*c0909341SAndroid Build Coastguard Worker        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
618*c0909341SAndroid Build Coastguard Workerendfunc
619*c0909341SAndroid Build Coastguard Worker.endm
620*c0909341SAndroid Build Coastguard Worker
// Instantiate the lag2 functions for every plane type (y, uv_444, uv_422,
// uv_420) and every edge position (left, mid, right). The second argument is
// the uv_layout value: 0 for y, otherwise 444, 422 or 420. Only the right
// edge overrides elems (15 for y and uv_444, 9 for uv_422 and uv_420); all
// other variants use the macro default of 16.
621*c0909341SAndroid Build Coastguard Workersum_lag2_func y,      0,   left
622*c0909341SAndroid Build Coastguard Workersum_lag2_func y,      0,   mid
623*c0909341SAndroid Build Coastguard Workersum_lag2_func y,      0,   right, 15
624*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_444, 444, left
625*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_444, 444, mid
626*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_444, 444, right, 15
627*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_422, 422, left
628*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_422, 422, mid
629*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_422, 422, right, 9
630*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_420, 420, left
631*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_420, 420, mid
632*c0909341SAndroid Build Coastguard Workersum_lag2_func uv_420, 420, right, 9
633*c0909341SAndroid Build Coastguard Worker
634*c0909341SAndroid Build Coastguard Worker
// sum_lag3_left_above_neon: left-edge entry point for the lag3 "above" sum.
// It loads the row three above starting at r0 - 3*GRAIN_WIDTH (without the
// extra -3 byte offset used by sum_lag3_above_neon), rotates the data so it
// sits in q11/q12 where the shared code expects it, and then branches into
// sum_lag3_above_start. The branch is a plain b, so the bx lr at the end of
// the shared code returns directly to this function's caller.
// Only the first of the three row loads is replaced; the two later loads in
// the shared code still use the -3 offset.
// NOTE(review): presumably those reads land in the tail of the preceding row
// and therefore stay inside the buffer - confirm against the buffer layout.
635*c0909341SAndroid Build Coastguard Workerfunction sum_lag3_left_above_neon
636*c0909341SAndroid Build Coastguard Worker        // A separate codepath for the left edge, to avoid reading outside
637*c0909341SAndroid Build Coastguard Worker        // of the edge of the buffer.
638*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #3*GRAIN_WIDTH
639*c0909341SAndroid Build Coastguard Worker        vld1.8          {q11, q12}, [r12]
640*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q11, q12, #13 // q12 = {q11[13..15], q12[0..12]}; must run before q11 is rotated
641*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q11, q11, #13 // q11 = {q11[13..15], q11[0..12]}; lanes 0-2 are wrapped-around filler
642*c0909341SAndroid Build Coastguard Worker        b               sum_lag3_above_start
643*c0909341SAndroid Build Coastguard Workerendfunc
644*c0909341SAndroid Build Coastguard Worker
// sum_lag3_above_neon: accumulate the 21 taps taken from the three rows above
// the current position (offsets -3..+3 in each row) for 16 elements at once.
// In:
//   r0                   - base address; row k above is read as 32 bytes
//                          from r0 - k*GRAIN_WIDTH - 3 (GRAIN_WIDTH is
//                          defined elsewhere; presumably the row stride)
//   d26[0..6]            - signed 8-bit coefficients, offsets -3..+3,
//                          three rows above
//   d26[7], d27[0..5]    - the same for the row two above
//   d27[6..7], d28[0..4] - the same for the row one above
// Out:
//   q2-q5                - 32-bit sums for elements 0-3, 4-7, 8-11, 12-15
// Clobbers q0, q1, q6-q12 and r12. The stack is not used; returns via bx lr.
// sum_lag3_above_start is a second entry point for a caller that has already
// placed the 32 bytes of the row three above in q11/q12
// (sum_lag3_left_above_neon does this).
// Since q11/q12 start at offset -3, the unshifted q11 is the tap at offset -3
// and vext by #k yields the tap at offset k-3. Taps are processed in pairs:
// vmull.s8 gives 16-bit products, vaddl.s16 adds a pair while widening to
// 32 bits, and vadd.i32 accumulates into q2-q5. The row loads and address
// computations are interleaved with the arithmetic, so the statement order
// is significant.
645*c0909341SAndroid Build Coastguard Workerfunction sum_lag3_above_neon
646*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #3*GRAIN_WIDTH + 3 // r12 = r0 - 3*GRAIN_WIDTH - 3
647*c0909341SAndroid Build Coastguard Worker        vld1.8          {q11, q12}, [r12]
648*c0909341SAndroid Build Coastguard Worker
649*c0909341SAndroid Build Coastguard Workersum_lag3_above_start:
650*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d26[0] // coefficient for offset -3, three rows up
651*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q11, q12, #1 // offset -2
652*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d26[1]
653*c0909341SAndroid Build Coastguard Worker
654*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d22, d20 // offset -3 uses q11 unshifted; elements 0-7
655*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d23, d20 // elements 8-15
656*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d18, d21
657*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d19, d21
658*c0909341SAndroid Build Coastguard Worker
659*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q11, q12, #2 // offset -1
660*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d26[2]
661*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q11, q12, #3 // offset 0
662*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d26[3]
663*c0909341SAndroid Build Coastguard Worker
664*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q2,  d0,  d12 // q2-q5 start out as the sum of the first two taps
665*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q3,  d1,  d13
666*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q4,  d2,  d14
667*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q5,  d3,  d15
668*c0909341SAndroid Build Coastguard Worker
669*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d16, d20
670*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d17, d20
671*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d18, d21
672*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d19, d21
673*c0909341SAndroid Build Coastguard Worker
674*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q8,  d0,  d12
675*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q9,  d1,  d13
676*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d14
677*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d15
678*c0909341SAndroid Build Coastguard Worker
679*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #4 // offset +1
680*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d26[4]
681*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q11, q12, #5 // offset +2
682*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d26[5]
683*c0909341SAndroid Build Coastguard Worker
684*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q8
685*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q9
686*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
687*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
688*c0909341SAndroid Build Coastguard Worker
689*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d20
690*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d20
691*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d14, d21
692*c0909341SAndroid Build Coastguard Worker        vmull.s8        q9,  d15, d21
693*c0909341SAndroid Build Coastguard Worker
694*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #2*GRAIN_WIDTH + 3 // r12 = r0 - 2*GRAIN_WIDTH - 3
695*c0909341SAndroid Build Coastguard Worker
696*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d0,  d16
697*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d1,  d17
698*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d18
699*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d19
700*c0909341SAndroid Build Coastguard Worker
701*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q11, q12, #6 // offset +3, last tap of the row three above
702*c0909341SAndroid Build Coastguard Worker        vld1.8          {q11, q12}, [r12] // q11/q12 now hold the row two above
703*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d26[6]
704*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d26[7] // coefficient for offset -3, two rows up
705*c0909341SAndroid Build Coastguard Worker
706*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q6
707*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q7
708*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
709*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
710*c0909341SAndroid Build Coastguard Worker
711*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d16, d20
712*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d17, d20
713*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d22, d21 // offset -3 of the new row (q11 unshifted)
714*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d23, d21
715*c0909341SAndroid Build Coastguard Worker
716*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q8,  d0,  d12
717*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q9,  d1,  d13
718*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d14
719*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d15
720*c0909341SAndroid Build Coastguard Worker
721*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #1 // offset -2
722*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d27[0]
723*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q11, q12, #2 // offset -1
724*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d27[1]
725*c0909341SAndroid Build Coastguard Worker
726*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q8
727*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q9
728*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
729*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
730*c0909341SAndroid Build Coastguard Worker
731*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d20
732*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d20
733*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d14, d21
734*c0909341SAndroid Build Coastguard Worker        vmull.s8        q9,  d15, d21
735*c0909341SAndroid Build Coastguard Worker
736*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d0,  d16
737*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d1,  d17
738*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d18
739*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d19
740*c0909341SAndroid Build Coastguard Worker
741*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q11, q12, #3 // offset 0
742*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d27[2]
743*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q11, q12, #4 // offset +1
744*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d27[3]
745*c0909341SAndroid Build Coastguard Worker
746*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q6
747*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q7
748*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
749*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
750*c0909341SAndroid Build Coastguard Worker
751*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d16, d20
752*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d17, d20
753*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d18, d21
754*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d19, d21
755*c0909341SAndroid Build Coastguard Worker
756*c0909341SAndroid Build Coastguard Worker        sub             r12, r0,  #1*GRAIN_WIDTH + 3 // r12 = r0 - 1*GRAIN_WIDTH - 3
757*c0909341SAndroid Build Coastguard Worker
758*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q8,  d0,  d12
759*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q9,  d1,  d13
760*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d14
761*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d15
762*c0909341SAndroid Build Coastguard Worker
763*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #5 // offset +2
764*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d27[4]
765*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q11, q12, #6 // offset +3, last tap of the row two above
766*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d27[5]
767*c0909341SAndroid Build Coastguard Worker
768*c0909341SAndroid Build Coastguard Worker        vld1.8          {q11, q12}, [r12] // q11/q12 now hold the row one above
769*c0909341SAndroid Build Coastguard Worker
770*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q8
771*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q9
772*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
773*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
774*c0909341SAndroid Build Coastguard Worker
775*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d20
776*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d20
777*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d14, d21
778*c0909341SAndroid Build Coastguard Worker        vmull.s8        q9,  d15, d21
779*c0909341SAndroid Build Coastguard Worker
780*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d0,  d16
781*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d1,  d17
782*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d18
783*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d19
784*c0909341SAndroid Build Coastguard Worker
785*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d27[6] // coefficient for offset -3, one row up
786*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q11, q12, #1 // offset -2
787*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d27[7]
788*c0909341SAndroid Build Coastguard Worker
789*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q6
790*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q7
791*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
792*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
793*c0909341SAndroid Build Coastguard Worker
794*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d22, d20 // offset -3 of the new row (q11 unshifted)
795*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d23, d20
796*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d18, d21
797*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d19, d21
798*c0909341SAndroid Build Coastguard Worker
799*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q8,  d0,  d12
800*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q9,  d1,  d13
801*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d14
802*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d15
803*c0909341SAndroid Build Coastguard Worker
804*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #2 // offset -1
805*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d28[0]
806*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q11, q12, #3 // offset 0
807*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d28[1]
808*c0909341SAndroid Build Coastguard Worker
809*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q8
810*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q9
811*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
812*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
813*c0909341SAndroid Build Coastguard Worker
814*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d20
815*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d20
816*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d14, d21
817*c0909341SAndroid Build Coastguard Worker        vmull.s8        q9,  d15, d21
818*c0909341SAndroid Build Coastguard Worker
819*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q6,  d0,  d16
820*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q7,  d1,  d17
821*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d18
822*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d19
823*c0909341SAndroid Build Coastguard Worker
824*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q11, q12, #4 // offset +1
825*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d28[2]
826*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q11, q12, #5 // offset +2
827*c0909341SAndroid Build Coastguard Worker        vdup.8          d21, d28[3]
828*c0909341SAndroid Build Coastguard Worker
829*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q6
830*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q7
831*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
832*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
833*c0909341SAndroid Build Coastguard Worker
834*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d16, d20
835*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d17, d20
836*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d18, d21
837*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d19, d21
838*c0909341SAndroid Build Coastguard Worker
839*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q8,  d0,  d12
840*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q9,  d1,  d13
841*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q0,  d2,  d14
842*c0909341SAndroid Build Coastguard Worker        vaddl.s16       q1,  d3,  d15
843*c0909341SAndroid Build Coastguard Worker
844*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q11, q12, #6 // offset +3, the last of the 21 taps
845*c0909341SAndroid Build Coastguard Worker        vdup.8          d20, d28[4]
846*c0909341SAndroid Build Coastguard Worker
847*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q8
848*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q9
849*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q0
850*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q1
851*c0909341SAndroid Build Coastguard Worker
852*c0909341SAndroid Build Coastguard Worker        vmull.s8        q0,  d12, d20
853*c0909341SAndroid Build Coastguard Worker        vmull.s8        q1,  d13, d20
854*c0909341SAndroid Build Coastguard Worker
855*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q2,  q2,  d0 // the final tap has no partner: widen and add it directly
856*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q3,  q3,  d1
857*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q4,  q4,  d2
858*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q5,  q5,  d3
859*c0909341SAndroid Build Coastguard Worker
860*c0909341SAndroid Build Coastguard Worker        bx              lr
861*c0909341SAndroid Build Coastguard Workerendfunc
862*c0909341SAndroid Build Coastguard Worker
// Emits the function sum_<type>_lag3_<edge>_neon. The function saves r1 and
// lr, then expands sum_lag_n_body for lag3 with store=1 and d29[0] as the
// uv coefficient. \elems defaults to 16 when the caller omits it.
// NOTE(review): no return instruction is visible in this macro; presumably
// the sum_lag_n_body expansion ends by popping the saved {r1, lr} pair into
// {r1, pc} -- confirm against that macro's definition.
863*c0909341SAndroid Build Coastguard Worker.macro sum_lag3_func type, uv_layout, edge, elems=16
864*c0909341SAndroid Build Coastguard Workerfunction sum_\type\()_lag3_\edge\()_neon
865*c0909341SAndroid Build Coastguard Worker        push            {r1, lr}
866*c0909341SAndroid Build Coastguard Worker        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
867*c0909341SAndroid Build Coastguard Workerendfunc
868*c0909341SAndroid Build Coastguard Worker.endm
869*c0909341SAndroid Build Coastguard Worker
// Instantiate the lag-3 helpers for every plane layout (y, uv_444, uv_422,
// uv_420) and edge position (left, mid, right). Only the right-edge variants
// pass an explicit element count (15 for y/uv_444, 9 for uv_422/uv_420);
// the others use the macro default of 16.
870*c0909341SAndroid Build Coastguard Workersum_lag3_func y,      0,   left
871*c0909341SAndroid Build Coastguard Workersum_lag3_func y,      0,   mid
872*c0909341SAndroid Build Coastguard Workersum_lag3_func y,      0,   right, 15
873*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_444, 444, left
874*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_444, 444, mid
875*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_444, 444, right, 15
876*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_422, 422, left
877*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_422, 422, mid
878*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_422, 422, right, 9
879*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_420, 420, left
880*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_420, 420, mid
881*c0909341SAndroid Build Coastguard Workersum_lag3_func uv_420, 420, right, 9
882*c0909341SAndroid Build Coastguard Worker
// Generates and stores r1 rows of grain, one row per loop iteration.
// In:  r1 = row count; decremented here. The body runs before the count is
//      tested, so at least one row is always produced.
//      Any further inputs are whatever the get_grain_row / store_grain_row
//      macros consume (they are defined outside this section).
// Uses d16-d26 as the row buffer. r11 and lr are pushed on entry; the final
// pop restores r11 and returns by loading the saved lr into pc.
883*c0909341SAndroid Build Coastguard Workerfunction generate_grain_rows_neon
884*c0909341SAndroid Build Coastguard Worker        push            {r11,lr}
885*c0909341SAndroid Build Coastguard Worker1:
886*c0909341SAndroid Build Coastguard Worker        get_grain_row   d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
887*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1        // sets the flags tested by bgt below
888*c0909341SAndroid Build Coastguard Worker        store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 // expands between subs and bgt, so it must leave the flags intact
889*c0909341SAndroid Build Coastguard Worker        bgt             1b
890*c0909341SAndroid Build Coastguard Worker        pop             {r11,pc}
891*c0909341SAndroid Build Coastguard Workerendfunc
892*c0909341SAndroid Build Coastguard Worker
// Variant of generate_grain_rows_neon built on the *_44 row macros: generates
// and stores r1 rows, using d16-d21 as the row buffer.
// In:  r1 = row count; decremented here. The body runs before the count is
//      tested, so at least one row is always produced.
// r11 and lr are pushed on entry; the final pop restores r11 and returns.
893*c0909341SAndroid Build Coastguard Workerfunction generate_grain_rows_44_neon
894*c0909341SAndroid Build Coastguard Worker        push            {r11,lr}
895*c0909341SAndroid Build Coastguard Worker1:
896*c0909341SAndroid Build Coastguard Worker        get_grain_row_44 d16, d17, d18, d19, d20, d21
897*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1        // sets the flags tested by bgt below
898*c0909341SAndroid Build Coastguard Worker        store_grain_row_44 d16, d17, d18, d19, d20, d21 // expands between subs and bgt, so it must leave the flags intact
899*c0909341SAndroid Build Coastguard Worker        bgt             1b
900*c0909341SAndroid Build Coastguard Worker        pop             {r11,pc}
901*c0909341SAndroid Build Coastguard Workerendfunc
902*c0909341SAndroid Build Coastguard Worker
// Produces 16 output bytes for the uv_444 lag-0 path: two batches of values
// from get_gaussian_neon plus a scaled contribution from 16 bytes read at r11.
// In:  r0  = output pointer (advanced by 16)
//      r11 = input pointer (advanced by 16)
//      q1  = byte mask ANDed with the input bytes
//      d22 = signed 8-bit coefficient multiplied with every input byte
//      q12 = per-lane shift applied to the products; the caller in
//            gen_grain_82 passes a negated amount, which makes vrshl a
//            rounding right shift
//      q15 = per-lane shift applied to the get_gaussian_neon results
// Writes q0, q2, q3, q8 and q9, in addition to whatever get_gaussian_neon
// itself modifies (not visible in this section).
903*c0909341SAndroid Build Coastguard Workerfunction gen_grain_uv_444_lag0_neon
904*c0909341SAndroid Build Coastguard Worker        vld1.8          {q3}, [r11]!        // q3 = 16 input bytes; r11 += 16
905*c0909341SAndroid Build Coastguard Worker        push            {r11,lr}            // pushed after the load, so the advanced r11 is what gets restored
906*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon   // result is read from q0
907*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q8,  q0,  q15
908*c0909341SAndroid Build Coastguard Worker        bl              get_gaussian_neon
909*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q9,  q0,  q15
910*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d0,  q8             // saturate both batches to signed 8 bit: q0 = 16 noise bytes
911*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d1,  q9
912*c0909341SAndroid Build Coastguard Worker
913*c0909341SAndroid Build Coastguard Worker        vand            q3,  q3,  q1        // zero the input bytes the mask excludes
914*c0909341SAndroid Build Coastguard Worker        vmull.s8        q2,  d6,  d22       // widen to 16 bit: input * coefficient
915*c0909341SAndroid Build Coastguard Worker        vmull.s8        q3,  d7,  d22
916*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q2,  q2,  q12
917*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q3,  q3,  q12
918*c0909341SAndroid Build Coastguard Worker        vaddw.s8        q2,  q2,  d0        // add the noise bytes to the shifted products
919*c0909341SAndroid Build Coastguard Worker        vaddw.s8        q3,  q3,  d1
920*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d4,  q2             // saturate back to signed 8 bit
921*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d5,  q3
922*c0909341SAndroid Build Coastguard Worker        vst1.8          {q2}, [r0]!         // store 16 bytes; r0 += 16
923*c0909341SAndroid Build Coastguard Worker        pop             {r11,pc}
924*c0909341SAndroid Build Coastguard Workerendfunc
925*c0909341SAndroid Build Coastguard Worker
// Callable wrapper around one expansion of the get_grain_row_44 macro, with
// d16-d21 as its register arguments. r11 and lr are saved around the
// expansion and restored by the returning pop.
926*c0909341SAndroid Build Coastguard Workerfunction get_grain_row_44_neon
927*c0909341SAndroid Build Coastguard Worker        push            {r11,lr}
928*c0909341SAndroid Build Coastguard Worker        get_grain_row_44 d16, d17, d18, d19, d20, d21
929*c0909341SAndroid Build Coastguard Worker        pop             {r11,pc}
930*c0909341SAndroid Build Coastguard Workerendfunc
931*c0909341SAndroid Build Coastguard Worker
// uv_420 front end for the lag-0 coefficient step: averages a 2x2 group of
// signed input bytes into each of 16 output lanes, then jumps to the shared
// tail add_coeff_lag0_start (inside add_uv_422_coeff_lag0_neon), which
// performs the return.
// In:  r11 = pointer to the first input row  (advanced by 32)
//      r12 = pointer to the second input row (advanced by 32); the caller in
//            gen_grain_44 sets it to r11 + GRAIN_WIDTH
//      q0, q1, d22, q12 = inputs of the shared tail
// Out: q2 (produced by the shared tail). Also overwrites q3, q4 and q5.
932*c0909341SAndroid Build Coastguard Workerfunction add_uv_420_coeff_lag0_neon
933*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r11]!
934*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [r12]!
935*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q2,  q2             // sum horizontally adjacent bytes into 16-bit lanes
936*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q3,  q3
937*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q4,  q4
938*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q5,  q5
939*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q4        // add the two rows: each lane now holds a 2x2 sum
940*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q5
941*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d4,  q2,  #2        // rounded divide by 4, narrowed to 8 bit
942*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d5,  q3,  #2
943*c0909341SAndroid Build Coastguard Worker        b               add_coeff_lag0_start
944*c0909341SAndroid Build Coastguard Workerendfunc
945*c0909341SAndroid Build Coastguard Worker
// uv_422 front end for the lag-0 coefficient step, followed by the tail that
// add_uv_420_coeff_lag0_neon also branches to.
// Front end: averages horizontally adjacent pairs of signed input bytes
// into 16 lanes.
// In:  r11 = input pointer (advanced by 32)
//      q0  = 16 signed bytes added to the scaled result
//      q1  = byte mask ANDed with the averaged input
//      d22 = signed 8-bit coefficient
//      q12 = per-lane shift applied to the products; the caller in
//            gen_grain_44 passes a negated amount, which makes vrshl a
//            rounding right shift
// Out: q2 = 16 saturated signed bytes. Also overwrites q3.
946*c0909341SAndroid Build Coastguard Workerfunction add_uv_422_coeff_lag0_neon
947*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r11]!
948*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q2,  q2             // sum horizontally adjacent bytes into 16-bit lanes
949*c0909341SAndroid Build Coastguard Worker        vpaddl.s8       q3,  q3
950*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d4,  q2,  #1        // rounded divide by 2, narrowed to 8 bit
951*c0909341SAndroid Build Coastguard Worker        vrshrn.s16      d5,  q3,  #1
952*c0909341SAndroid Build Coastguard Worker
// Shared tail: expects the 16 averaged bytes in q2.
953*c0909341SAndroid Build Coastguard Workeradd_coeff_lag0_start:
954*c0909341SAndroid Build Coastguard Worker        vand            q3,  q2,  q1        // zero the lanes the mask excludes
955*c0909341SAndroid Build Coastguard Worker        vmull.s8        q2,  d6,  d22       // widen to 16 bit: input * coefficient
956*c0909341SAndroid Build Coastguard Worker        vmull.s8        q3,  d7,  d22
957*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q2,  q2,  q12
958*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q3,  q3,  q12
959*c0909341SAndroid Build Coastguard Worker        vaddw.s8        q2,  q2,  d0        // add the bytes supplied in q0
960*c0909341SAndroid Build Coastguard Worker        vaddw.s8        q3,  q3,  d1
961*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d4,  q2             // saturate back to signed 8 bit
962*c0909341SAndroid Build Coastguard Worker        vqmovn.s16      d5,  q3
963*c0909341SAndroid Build Coastguard Worker        bx              lr
964*c0909341SAndroid Build Coastguard Workerendfunc
965*c0909341SAndroid Build Coastguard Worker
// Emits the exported entry point generate_grain_<type>_8bpc_neon for
// type = y or uv_444. Each generated row is 82 bytes: five 16-byte blocks
// plus one 2-byte store.
//
// Arguments as read by the code:
//   r0 = output pointer, advanced by post-incrementing stores
//   type y:      r1 = pointer to the structure accessed via FGD_* offsets
//   type uv_444: r1 = base of the bytes read through r11 (presumably the luma
//                     grain buffer -- TODO confirm against the C prototype)
//                r2 = pointer to the structure accessed via FGD_* offsets
//                r3 = selector: multiplied by 28 to offset into the uv
//                     coefficients and tested against zero to pick the seed
//                     XOR constant (presumably the chroma plane index --
//                     TODO confirm)
//
// State established before dispatching on FGD_AR_COEFF_LAG:
//   r2  = seed (XORed with a constant for uv_444)
//   r3  = address of gaussian_sequence
//   r4  = pointer to the AR coefficients for this plane
//   r7  = ar_coeff_shift
//   r8  = (1 << ar_coeff_shift) >> 1
//   r9  = 4 + grain_scale_shift
//   r10 = (1 << r9) >> 1
//   q15 = -(4 + grain_scale_shift) in every 16-bit lane
// The lag1/lag2/lag3 paths overwrite some of these (r4, r5, r10, r12) with
// values for the sum_* helpers, which are defined outside this section.
966*c0909341SAndroid Build Coastguard Worker.macro gen_grain_82 type
967*c0909341SAndroid Build Coastguard Workerfunction generate_grain_\type\()_8bpc_neon, export=1
968*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
969*c0909341SAndroid Build Coastguard Worker
970*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
971*c0909341SAndroid Build Coastguard Worker        mov             r12, r3             // r12 = incoming r3, scaled by 28 below
972*c0909341SAndroid Build Coastguard Worker        mov             lr,  #28
973*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #3*GRAIN_WIDTH // r11 = incoming r1 + 3*GRAIN_WIDTH; input pointer for the uv_444 helpers
974*c0909341SAndroid Build Coastguard Worker        mov             r1,  r2             // from here on r1 is the FGD_* structure pointer for both types
975*c0909341SAndroid Build Coastguard Worker        mul             r12, r12, lr        // r12 = 28 * incoming r3
976*c0909341SAndroid Build Coastguard Worker.endif
977*c0909341SAndroid Build Coastguard Worker        movrel          r3,  X(gaussian_sequence)
978*c0909341SAndroid Build Coastguard Worker        ldr             r2,  [r1, #FGD_SEED]
979*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
980*c0909341SAndroid Build Coastguard Worker.ifc \type, y
981*c0909341SAndroid Build Coastguard Worker        add             r4,  r1,  #FGD_AR_COEFFS_Y
982*c0909341SAndroid Build Coastguard Worker.else
983*c0909341SAndroid Build Coastguard Worker        add             r4,  r1,  #FGD_AR_COEFFS_UV
984*c0909341SAndroid Build Coastguard Worker.endif
        // Dispatch setup: the table holds offsets relative to its own address,
        // so the target is table address + table[lag].
985*c0909341SAndroid Build Coastguard Worker        adr             r5,  L(gen_grain_\type\()_tbl)
986*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
987*c0909341SAndroid Build Coastguard Worker        add             r9,  r9,  #4
988*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [r5, r6, lsl #2]
989*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r9    // 4 + data->grain_scale_shift
990*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r6
991*c0909341SAndroid Build Coastguard Worker        vneg.s16        q15, q15            // negated so vrshl by q15 is a rounding right shift
992*c0909341SAndroid Build Coastguard Worker
993*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
        // Seed XOR constant: 0xb524 when the incoming r3 was zero (r12 == 0),
        // 0x49d8 otherwise.
994*c0909341SAndroid Build Coastguard Worker        cmp             r12, #0
995*c0909341SAndroid Build Coastguard Worker        movw            r10, #0x49d8
996*c0909341SAndroid Build Coastguard Worker        movw            lr,  #0xb524
997*c0909341SAndroid Build Coastguard Worker        // Intentionally using a separate register instead of moveq with an
998*c0909341SAndroid Build Coastguard Worker        // immediate constant, to avoid armv8 deprecated it instruction forms.
999*c0909341SAndroid Build Coastguard Worker        it              eq
1000*c0909341SAndroid Build Coastguard Worker        moveq           r10, lr
1001*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
1002*c0909341SAndroid Build Coastguard Worker        eor             r2,  r2,  r10
1003*c0909341SAndroid Build Coastguard Worker.endif
1004*c0909341SAndroid Build Coastguard Worker
        // Rounding constants. Shifting left and then right by one yields 0
        // (not a negative shift) when the shift amount itself is 0.
1005*c0909341SAndroid Build Coastguard Worker        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
1006*c0909341SAndroid Build Coastguard Worker        mov             r8,  #1
1007*c0909341SAndroid Build Coastguard Worker        mov             r10, #1
1008*c0909341SAndroid Build Coastguard Worker        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
1009*c0909341SAndroid Build Coastguard Worker        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
1010*c0909341SAndroid Build Coastguard Worker        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
1011*c0909341SAndroid Build Coastguard Worker        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
1012*c0909341SAndroid Build Coastguard Worker
1013*c0909341SAndroid Build Coastguard Worker        bx              r5
1014*c0909341SAndroid Build Coastguard Worker
        // Jump table indexed by the lag value (0-3). CONFIG_THUMB is added to
        // every entry, so the address handed to bx carries it in bit 0.
1015*c0909341SAndroid Build Coastguard Worker        .align 2
1016*c0909341SAndroid Build Coastguard WorkerL(gen_grain_\type\()_tbl):
1017*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1018*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1019*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1020*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1021*c0909341SAndroid Build Coastguard Worker
        // lag 0. For y every row goes through generate_grain_rows_neon. For
        // uv_444 the first 3 rows do, and each remaining row is built from
        // five 16-byte helper calls plus a 2-byte tail.
1022*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag0):
1023*c0909341SAndroid Build Coastguard Worker.ifc \type, y
1024*c0909341SAndroid Build Coastguard Worker        mov             r1,  #GRAIN_HEIGHT
1025*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_neon
1026*c0909341SAndroid Build Coastguard Worker.else
1027*c0909341SAndroid Build Coastguard Worker
1028*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1029*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_neon
1030*c0909341SAndroid Build Coastguard Worker        mov             r1,  #GRAIN_HEIGHT-3
1031*c0909341SAndroid Build Coastguard Worker
1032*c0909341SAndroid Build Coastguard Worker        vdup.16         q12, r7
1033*c0909341SAndroid Build Coastguard Worker        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
1034*c0909341SAndroid Build Coastguard Worker        vmov.i8         q0,  #0
1035*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  #255
1036*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q0,  q1,  #13  // q13 = 3 zero bytes then 13 x 0xff: mask for the first block of a row
1037*c0909341SAndroid Build Coastguard Worker        vext.8          q14, q1,  q0,  #1   // q14 = 15 x 0xff then 1 zero byte: mask for the last block of a row
1038*c0909341SAndroid Build Coastguard Worker        vneg.s16        q12, q12            // q12 = -ar_coeff_shift, so vrshl by q12 is a rounding right shift
1039*c0909341SAndroid Build Coastguard Worker
1040*c0909341SAndroid Build Coastguard Worker1:
1041*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q13
1042*c0909341SAndroid Build Coastguard Worker        bl              gen_grain_uv_444_lag0_neon // 16
1043*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  #255
1044*c0909341SAndroid Build Coastguard Worker        bl              gen_grain_uv_444_lag0_neon // 32
1045*c0909341SAndroid Build Coastguard Worker        bl              gen_grain_uv_444_lag0_neon // 48
1046*c0909341SAndroid Build Coastguard Worker        bl              gen_grain_uv_444_lag0_neon // 64
1047*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q14
1048*c0909341SAndroid Build Coastguard Worker        bl              gen_grain_uv_444_lag0_neon // 80
1049*c0909341SAndroid Build Coastguard Worker        get_grain_2     d16
1050*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1051*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #2        // r11 advances 5*16 + 2 bytes per row, the same amount as r0
1052*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16[0]}, [r0]!     // 2-byte tail of the row
1053*c0909341SAndroid Build Coastguard Worker        bgt             1b
1054*c0909341SAndroid Build Coastguard Worker.endif
1055*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1056*c0909341SAndroid Build Coastguard Worker
        // lag 1. Coefficients 0-2 are replicated into d27-d29; r4 is then
        // replaced by coefficient 3 as a sign-extended scalar.
1057*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag1):
1058*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}             // d8-d15 are callee-saved under the AAPCS
1059*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127           // not read in this macro body; presumably a bound for the sum_* helpers -- TODO confirm
1060*c0909341SAndroid Build Coastguard Worker        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
1061*c0909341SAndroid Build Coastguard Worker        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
1062*c0909341SAndroid Build Coastguard Worker        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
1063*c0909341SAndroid Build Coastguard Worker.ifc \type, y
1064*c0909341SAndroid Build Coastguard Worker        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
1065*c0909341SAndroid Build Coastguard Worker.else
1066*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #2
1067*c0909341SAndroid Build Coastguard Worker.endif
1068*c0909341SAndroid Build Coastguard Worker
1069*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1070*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
1071*c0909341SAndroid Build Coastguard Worker        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
1072*c0909341SAndroid Build Coastguard Worker        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
1073*c0909341SAndroid Build Coastguard Worker.endif
1074*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_neon
1075*c0909341SAndroid Build Coastguard Worker
1076*c0909341SAndroid Build Coastguard Worker        mov             r1,  #GRAIN_HEIGHT - 3
1077*c0909341SAndroid Build Coastguard Worker1:
1078*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
1079*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q8,  q8,  q9,  q10
1080*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q9,  q9,  q10, q11
1081*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q10, q10, q11, q12
1082*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q12, q11, q12, q13, right
1083*c0909341SAndroid Build Coastguard Worker        get_grain_2     d26
1084*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1085*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
1086*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #2
1087*c0909341SAndroid Build Coastguard Worker.endif
1088*c0909341SAndroid Build Coastguard Worker        store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
        // Move the row just stored (q7-q10) into q8-q11, the registers the
        // sum_* invocations above take as inputs; q12 stays where it is.
        // vmov does not alter the flags set by subs.
1089*c0909341SAndroid Build Coastguard Worker        vmov            q11, q10
1090*c0909341SAndroid Build Coastguard Worker        vmov            q10, q9
1091*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q8
1092*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q7
1093*c0909341SAndroid Build Coastguard Worker        bgt             1b
1094*c0909341SAndroid Build Coastguard Worker
1095*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1096*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1097*c0909341SAndroid Build Coastguard Worker
        // lag 2. 16 coefficient bytes are kept in q14 (d28/d29); bytes 10 and
        // 11 are also extracted, sign-extended, into r4 and r10.
1098*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag2):
1099*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}             // d8-d15 are callee-saved under the AAPCS
1100*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127           // not read in this macro body; presumably a bound for the sum_* helpers -- TODO confirm
1101*c0909341SAndroid Build Coastguard Worker        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
1102*c0909341SAndroid Build Coastguard Worker
1103*c0909341SAndroid Build Coastguard Worker        vmov.s8         r4,  d29[2]         // coefficient byte 10; replaces the coefficient pointer
1104*c0909341SAndroid Build Coastguard Worker        vmov.s8         r10, d29[3]         // coefficient byte 11; replaces the rounding constant set up before dispatch
1105*c0909341SAndroid Build Coastguard Worker
1106*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1107*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_neon
1108*c0909341SAndroid Build Coastguard Worker
1109*c0909341SAndroid Build Coastguard Worker        mov             r1,  #GRAIN_HEIGHT - 3
1110*c0909341SAndroid Build Coastguard Worker1:
1111*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_left_neon
1112*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_mid_neon
1113*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_mid_neon
1114*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_mid_neon
1115*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_right_neon
1116*c0909341SAndroid Build Coastguard Worker        get_grain_2     d16
1117*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1118*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
1119*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #2
1120*c0909341SAndroid Build Coastguard Worker.endif
1121*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16[0]}, [r0]!     // 2-byte tail of the row
1122*c0909341SAndroid Build Coastguard Worker        bgt             1b
1123*c0909341SAndroid Build Coastguard Worker
1124*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1125*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1126*c0909341SAndroid Build Coastguard Worker
        // lag 3. 32 coefficient bytes are kept in q13/q14; bytes 21-23 are
        // also packed, zero-extended, into r4 as b21 | b22 << 8 | b23 << 16.
1127*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag3):
1128*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}             // d8-d15 are callee-saved under the AAPCS
1129*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127           // not read in this macro body; presumably a bound for the sum_* helpers -- TODO confirm
1130*c0909341SAndroid Build Coastguard Worker        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
1131*c0909341SAndroid Build Coastguard Worker
1132*c0909341SAndroid Build Coastguard Worker        vmov.u8         r4,  d28[5]
1133*c0909341SAndroid Build Coastguard Worker        vmov.u8         r10, d28[6]
1134*c0909341SAndroid Build Coastguard Worker        vmov.u8         r12, d28[7]
1135*c0909341SAndroid Build Coastguard Worker
1136*c0909341SAndroid Build Coastguard Worker        orr             r4,  r4,  r10, lsl #8
1137*c0909341SAndroid Build Coastguard Worker        orr             r4,  r4,  r12, lsl #16
1138*c0909341SAndroid Build Coastguard Worker
1139*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
        // generate_grain_rows_neon uses d16-d26 as its row buffer, so d26
        // (coefficient bytes 0-7, low half of q13) is saved around the call.
1140*c0909341SAndroid Build Coastguard Worker        vpush           {d26}
1141*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_neon
1142*c0909341SAndroid Build Coastguard Worker        vpop            {d26}
1143*c0909341SAndroid Build Coastguard Worker
1144*c0909341SAndroid Build Coastguard Worker        mov             r1,  #GRAIN_HEIGHT - 3
1145*c0909341SAndroid Build Coastguard Worker1:
1146*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_left_neon
1147*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_mid_neon
1148*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_mid_neon
1149*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_mid_neon
1150*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_right_neon
1151*c0909341SAndroid Build Coastguard Worker        get_grain_2     d16
1152*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1153*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_444
1154*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #2
1155*c0909341SAndroid Build Coastguard Worker.endif
1156*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16[0]}, [r0]!     // 2-byte tail of the row
1157*c0909341SAndroid Build Coastguard Worker        bgt             1b
1158*c0909341SAndroid Build Coastguard Worker
1159*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1160*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1161*c0909341SAndroid Build Coastguard Workerendfunc
1162*c0909341SAndroid Build Coastguard Worker.endm
1163*c0909341SAndroid Build Coastguard Worker
// Instantiate the entry points generate_grain_y_8bpc_neon and
// generate_grain_uv_444_8bpc_neon.
1164*c0909341SAndroid Build Coastguard Workergen_grain_82 y
1165*c0909341SAndroid Build Coastguard Workergen_grain_82 uv_444
1166*c0909341SAndroid Build Coastguard Worker
// Loads \dst with the number of rows left after the first 3:
// SUB_GRAIN_HEIGHT-3 for uv_420, GRAIN_HEIGHT-3 for any other type.
// In gen_grain_44 it is used right after generate_grain_rows_44_neon has been
// called with a row count of 3.
1167*c0909341SAndroid Build Coastguard Worker.macro set_height dst, type
1168*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
1169*c0909341SAndroid Build Coastguard Worker        mov             \dst,  #SUB_GRAIN_HEIGHT-3
1170*c0909341SAndroid Build Coastguard Worker.else
1171*c0909341SAndroid Build Coastguard Worker        mov             \dst,  #GRAIN_HEIGHT-3
1172*c0909341SAndroid Build Coastguard Worker.endif
1173*c0909341SAndroid Build Coastguard Worker.endm
1174*c0909341SAndroid Build Coastguard Worker
// Moves the input pointer \reg to the start of the next input row(s) after
// one output row. In gen_grain_44 the add_uv_*_coeff_lag0_neon helper is
// called three times per row and each call post-increments the pointer by 32
// bytes, so \reg has already advanced 3*32 bytes. The net step is therefore
// 2*GRAIN_WIDTH for uv_420 and GRAIN_WIDTH for any other type.
1175*c0909341SAndroid Build Coastguard Worker.macro increment_y_ptr reg, type
1176*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
1177*c0909341SAndroid Build Coastguard Worker        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
1178*c0909341SAndroid Build Coastguard Worker.else
1179*c0909341SAndroid Build Coastguard Worker        sub             \reg, \reg, #3*32-GRAIN_WIDTH
1180*c0909341SAndroid Build Coastguard Worker.endif
1181*c0909341SAndroid Build Coastguard Worker.endm
1182*c0909341SAndroid Build Coastguard Worker
1183*c0909341SAndroid Build Coastguard Worker.macro gen_grain_44 type
1184*c0909341SAndroid Build Coastguard Workerfunction generate_grain_\type\()_8bpc_neon, export=1
1185*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
1186*c0909341SAndroid Build Coastguard Worker
1187*c0909341SAndroid Build Coastguard Worker        mov             r12, r3
1188*c0909341SAndroid Build Coastguard Worker        mov             lr,  #28
1189*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #3*GRAIN_WIDTH-3
1190*c0909341SAndroid Build Coastguard Worker        mov             r1,  r2
1191*c0909341SAndroid Build Coastguard Worker        mul             r12, r12, lr
1192*c0909341SAndroid Build Coastguard Worker
1193*c0909341SAndroid Build Coastguard Worker        movrel          r3,  X(gaussian_sequence)
1194*c0909341SAndroid Build Coastguard Worker        ldr             r2,  [r1, #FGD_SEED]
1195*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
1196*c0909341SAndroid Build Coastguard Worker        add             r4,  r1,  #FGD_AR_COEFFS_UV
1197*c0909341SAndroid Build Coastguard Worker        adr             r5,  L(gen_grain_\type\()_tbl)
1198*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
1199*c0909341SAndroid Build Coastguard Worker        add             r9,  r9,  #4
1200*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [r5, r6, lsl #2]
1201*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r9    // 4 + data->grain_scale_shift
1202*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r6
1203*c0909341SAndroid Build Coastguard Worker        vneg.s16        q15, q15
1204*c0909341SAndroid Build Coastguard Worker
1205*c0909341SAndroid Build Coastguard Worker        cmp             r12, #0
1206*c0909341SAndroid Build Coastguard Worker        movw            r10, #0x49d8
1207*c0909341SAndroid Build Coastguard Worker        movw            lr,  #0xb524
1208*c0909341SAndroid Build Coastguard Worker        // Intentionally using a separate register instead of moveq with an
1209*c0909341SAndroid Build Coastguard Worker        // immediate constant, to avoid armv8 deprecated it instruction forms.
1210*c0909341SAndroid Build Coastguard Worker        it              eq
1211*c0909341SAndroid Build Coastguard Worker        moveq           r10, lr
1212*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
1213*c0909341SAndroid Build Coastguard Worker        eor             r2,  r2,  r10
1214*c0909341SAndroid Build Coastguard Worker
1215*c0909341SAndroid Build Coastguard Worker        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
1216*c0909341SAndroid Build Coastguard Worker        mov             r8,  #1
1217*c0909341SAndroid Build Coastguard Worker        mov             r10, #1
1218*c0909341SAndroid Build Coastguard Worker        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
1219*c0909341SAndroid Build Coastguard Worker        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
1220*c0909341SAndroid Build Coastguard Worker        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
1221*c0909341SAndroid Build Coastguard Worker        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
1222*c0909341SAndroid Build Coastguard Worker        bx              r5
1223*c0909341SAndroid Build Coastguard Worker
1224*c0909341SAndroid Build Coastguard Worker        .align 2
1225*c0909341SAndroid Build Coastguard WorkerL(gen_grain_\type\()_tbl):
1226*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1227*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1228*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1229*c0909341SAndroid Build Coastguard Worker        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
1230*c0909341SAndroid Build Coastguard Worker
1231*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag0):
1232*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
1233*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q5}
1234*c0909341SAndroid Build Coastguard Worker.endif
1235*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1236*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_44_neon
1237*c0909341SAndroid Build Coastguard Worker        set_height      r1,  \type
1238*c0909341SAndroid Build Coastguard Worker
1239*c0909341SAndroid Build Coastguard Worker        vdup.16         q12, r7
1240*c0909341SAndroid Build Coastguard Worker        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
1241*c0909341SAndroid Build Coastguard Worker        vmov.i8         q0,  #0
1242*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  #255
1243*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q0,  q1,  #13
1244*c0909341SAndroid Build Coastguard Worker        vext.8          q14, q1,  q0,  #7
1245*c0909341SAndroid Build Coastguard Worker        vneg.s16        q12, q12
1246*c0909341SAndroid Build Coastguard Worker
1247*c0909341SAndroid Build Coastguard Worker1:
1248*c0909341SAndroid Build Coastguard Worker        bl              get_grain_row_44_neon
1249*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
1250*c0909341SAndroid Build Coastguard Worker        add             r12, r11, #GRAIN_WIDTH
1251*c0909341SAndroid Build Coastguard Worker.endif
1252*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q13
1253*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q8
1254*c0909341SAndroid Build Coastguard Worker        bl              add_\type\()_coeff_lag0_neon
1255*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  #255
1256*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q9
1257*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q2
1258*c0909341SAndroid Build Coastguard Worker        bl              add_\type\()_coeff_lag0_neon
1259*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  q14
1260*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q10
1261*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q2
1262*c0909341SAndroid Build Coastguard Worker        bl              add_\type\()_coeff_lag0_neon
1263*c0909341SAndroid Build Coastguard Worker        vmov            q10, q2
1264*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1265*c0909341SAndroid Build Coastguard Worker        increment_y_ptr r11, \type
1266*c0909341SAndroid Build Coastguard Worker        store_grain_row_44 d16, d17, d18, d19, d20, d21
1267*c0909341SAndroid Build Coastguard Worker        bgt             1b
1268*c0909341SAndroid Build Coastguard Worker
1269*c0909341SAndroid Build Coastguard Worker.ifc \type, uv_420
1270*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q5}
1271*c0909341SAndroid Build Coastguard Worker.endif
1272*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1273*c0909341SAndroid Build Coastguard Worker
1274*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag1):
1275*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
1276*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127
1277*c0909341SAndroid Build Coastguard Worker        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
1278*c0909341SAndroid Build Coastguard Worker        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
1279*c0909341SAndroid Build Coastguard Worker        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
1280*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #2
1281*c0909341SAndroid Build Coastguard Worker
1282*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1283*c0909341SAndroid Build Coastguard Worker        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
1284*c0909341SAndroid Build Coastguard Worker        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
1285*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_44_neon
1286*c0909341SAndroid Build Coastguard Worker
1287*c0909341SAndroid Build Coastguard Worker        set_height      r1,  \type
1288*c0909341SAndroid Build Coastguard Worker1:
1289*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
1290*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q8,  q8,  q9,  q10
1291*c0909341SAndroid Build Coastguard Worker        sum_\type\()_lag1 q10, q9,  q10, q11, right
1292*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1293*c0909341SAndroid Build Coastguard Worker        increment_y_ptr r11, \type
1294*c0909341SAndroid Build Coastguard Worker        store_grain_row_44 d14, d15, d16, d17, d20, d21
1295*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q8
1296*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q7
1297*c0909341SAndroid Build Coastguard Worker        bgt             1b
1298*c0909341SAndroid Build Coastguard Worker
1299*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1300*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1301*c0909341SAndroid Build Coastguard Worker
1302*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag2):
1303*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
1304*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127
1305*c0909341SAndroid Build Coastguard Worker        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]
1306*c0909341SAndroid Build Coastguard Worker
1307*c0909341SAndroid Build Coastguard Worker        vmov.s8         r4,  d29[2]
1308*c0909341SAndroid Build Coastguard Worker        vmov.s8         r10, d29[3]
1309*c0909341SAndroid Build Coastguard Worker
1310*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1311*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_44_neon
1312*c0909341SAndroid Build Coastguard Worker
1313*c0909341SAndroid Build Coastguard Worker        set_height      r1,  \type
1314*c0909341SAndroid Build Coastguard Worker1:
1315*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_left_neon
1316*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_mid_neon
1317*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag2_right_neon
1318*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1319*c0909341SAndroid Build Coastguard Worker        increment_y_ptr r11, \type
1320*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  #GRAIN_WIDTH-48
1321*c0909341SAndroid Build Coastguard Worker        bgt             1b
1322*c0909341SAndroid Build Coastguard Worker
1323*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1324*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1325*c0909341SAndroid Build Coastguard Worker
1326*c0909341SAndroid Build Coastguard WorkerL(generate_grain_\type\()_lag3):
1327*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
1328*c0909341SAndroid Build Coastguard Worker        mov             r5,  #127
1329*c0909341SAndroid Build Coastguard Worker        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
1330*c0909341SAndroid Build Coastguard Worker
1331*c0909341SAndroid Build Coastguard Worker        vmov.u8         r4,  d28[5]
1332*c0909341SAndroid Build Coastguard Worker        vmov.u8         r10, d28[6]
1333*c0909341SAndroid Build Coastguard Worker        vmov.u8         r12, d28[7]
1334*c0909341SAndroid Build Coastguard Worker
1335*c0909341SAndroid Build Coastguard Worker        orr             r4,  r4,  r10, lsl #8
1336*c0909341SAndroid Build Coastguard Worker        orr             r4,  r4,  r12, lsl #16
1337*c0909341SAndroid Build Coastguard Worker
1338*c0909341SAndroid Build Coastguard Worker        mov             r1,  #3
1339*c0909341SAndroid Build Coastguard Worker        bl              generate_grain_rows_44_neon
1340*c0909341SAndroid Build Coastguard Worker
1341*c0909341SAndroid Build Coastguard Worker        set_height      r1,  \type
1342*c0909341SAndroid Build Coastguard Worker1:
1343*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_left_neon
1344*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_mid_neon
1345*c0909341SAndroid Build Coastguard Worker        bl              sum_\type\()_lag3_right_neon
1346*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1
1347*c0909341SAndroid Build Coastguard Worker        increment_y_ptr r11, \type
1348*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  #GRAIN_WIDTH-48
1349*c0909341SAndroid Build Coastguard Worker        bgt             1b
1350*c0909341SAndroid Build Coastguard Worker
1351*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1352*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1353*c0909341SAndroid Build Coastguard Workerendfunc
1354*c0909341SAndroid Build Coastguard Worker.endm
1355*c0909341SAndroid Build Coastguard Worker
// Expand the gen_grain_44 macro once per chroma type. The macro body that
// ends just above specialises on its type argument (.ifc against uv_420,
// and the label and helper names are built from it).
// NOTE(review): the .macro line itself is outside this excerpt; presumably
// the body ending just above is gen_grain_44 - confirm.
1356*c0909341SAndroid Build Coastguard Workergen_grain_44 uv_420
1357*c0909341SAndroid Build Coastguard Workergen_grain_44 uv_422
1358*c0909341SAndroid Build Coastguard Worker
// gather_interleaved dst1, dst2, src1, src2, off
//
// Byte table lookup for every second lane of two D registers.
// For each lane n in {0, 2, 4, 6} + off:
//     dst1[n] = *(uint8_t *)(r3 + src1[n])
//     dst2[n] = *(uint8_t *)(r3 + src2[n])
// The index bytes are extracted zero-extended (vmov.u8). Invoke once with
// off = 0 and once with off = 1 to fill all eight lanes of each destination.
//
// In:       r3 = base address of the byte table
//           src1, src2 = D registers holding the index bytes
// Out:      the selected lanes of dst1 and dst2 (single-lane vld1.8 loads,
//           which leave the other lanes of the destination unchanged)
// Clobbers: r11, r12, lr
//
// The src1 and src2 lookups are interleaved and rotate through the three
// scratch registers, presumably so that index extraction, address formation
// and the dependent load do not sit back to back.
1359*c0909341SAndroid Build Coastguard Worker.macro gather_interleaved dst1, dst2, src1, src2, off
1360*c0909341SAndroid Build Coastguard Worker        vmov.u8         r11, \src1[0+\off]      // r11 = index from src1, lane 0+off
1361*c0909341SAndroid Build Coastguard Worker        vmov.u8         r12, \src2[0+\off]      // r12 = index from src2, lane 0+off
1362*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r3            // r11 = &table[index]
1363*c0909341SAndroid Build Coastguard Worker        vmov.u8         lr,  \src1[2+\off]
1364*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r3
1365*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst1[0+\off]}, [r11]  // dst1 lane 0+off = table[index]
1366*c0909341SAndroid Build Coastguard Worker        vmov.u8         r11, \src2[2+\off]
1367*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r3
1368*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst2[0+\off]}, [r12]
1369*c0909341SAndroid Build Coastguard Worker        vmov.u8         r12, \src1[4+\off]
1370*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r3
1371*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst1[2+\off]}, [lr]
1372*c0909341SAndroid Build Coastguard Worker        vmov.u8         lr,  \src2[4+\off]
1373*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r3
1374*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst2[2+\off]}, [r11]
1375*c0909341SAndroid Build Coastguard Worker        vmov.u8         r11, \src1[6+\off]
1376*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r3
1377*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst1[4+\off]}, [r12]
1378*c0909341SAndroid Build Coastguard Worker        vmov.u8         r12, \src2[6+\off]
1379*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r3
1380*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst2[4+\off]}, [lr]
1381*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r3
1382*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst1[6+\off]}, [r11]
1383*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst2[6+\off]}, [r12]
1384*c0909341SAndroid Build Coastguard Worker.endm
1385*c0909341SAndroid Build Coastguard Worker
// gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
//
// 32-byte table lookup: for N = 1..4 and every byte lane i,
//     dstN[i] = *(uint8_t *)(r3 + srcN[i])
// Registers 1 and 3 are processed together, then registers 2 and 4; each
// pair needs two gather_interleaved expansions (even lanes, then odd lanes).
//
// In:       r3 = base address of the byte table
// Clobbers: r11, r12, lr (through gather_interleaved)
1386*c0909341SAndroid Build Coastguard Worker.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
1387*c0909341SAndroid Build Coastguard Worker        gather_interleaved \dst1, \dst3, \src1, \src3, 0   // even lanes of 1 and 3
1388*c0909341SAndroid Build Coastguard Worker        gather_interleaved \dst1, \dst3, \src1, \src3, 1   // odd lanes of 1 and 3
1389*c0909341SAndroid Build Coastguard Worker        gather_interleaved \dst2, \dst4, \src2, \src4, 0   // even lanes of 2 and 4
1390*c0909341SAndroid Build Coastguard Worker        gather_interleaved \dst2, \dst4, \src2, \src4, 1   // odd lanes of 2 and 4
1391*c0909341SAndroid Build Coastguard Worker.endm
1392*c0909341SAndroid Build Coastguard Worker
// gather32_neon: look up 32 bytes in the table addressed by r3.
//
// In:   r3      = base address of the byte table
//       d0-d3   = 32 unsigned byte indices
// Out:  d8-d11  = table[index], lane for lane (d8 from d0, d9 from d1,
//                 d10 from d2, d11 from d3)
//
// r11, r12 and lr are scratch registers of the gather macro, so they are
// saved on entry; the function returns by popping the saved lr into pc.
// d0-d3 and r3 are only read.
1393*c0909341SAndroid Build Coastguard Workerfunction gather32_neon
1394*c0909341SAndroid Build Coastguard Worker        push            {r11-r12,lr}
1395*c0909341SAndroid Build Coastguard Worker        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3
1396*c0909341SAndroid Build Coastguard Worker        pop             {r11-r12,pc}
1397*c0909341SAndroid Build Coastguard Workerendfunc
1398*c0909341SAndroid Build Coastguard Worker
// gather16_neon: look up 16 bytes in the table addressed by r3.
//
// In:   r3     = base address of the byte table
//       d0-d1  = 16 unsigned byte indices
// Out:  d8-d9  = table[index], lane for lane (d8 from d0, d9 from d1)
//
// r11, r12 and lr are scratch registers of gather_interleaved, so they are
// saved on entry; the function returns by popping the saved lr into pc.
1399*c0909341SAndroid Build Coastguard Workerfunction gather16_neon
1400*c0909341SAndroid Build Coastguard Worker        push            {r11-r12,lr}
1401*c0909341SAndroid Build Coastguard Worker        gather_interleaved d8,  d9,  d0,  d1,  0   // even lanes
1402*c0909341SAndroid Build Coastguard Worker        gather_interleaved d8,  d9,  d0,  d1,  1   // odd lanes
1403*c0909341SAndroid Build Coastguard Worker        pop             {r11-r12,pc}
1404*c0909341SAndroid Build Coastguard Workerendfunc
1405*c0909341SAndroid Build Coastguard Worker
// Blend weights for overlapping grain blocks. fgy_32x32_8bpc_neon loads the
// first row into d24 and the second row into d25.
//
// x overlap: per column, row 0 multiplies the grain of the previous ("old")
// block and row 1 multiplies the grain of the current block; the sum is then
// rounded and shifted right by 5. Only the first two columns actually blend
// (27/17 and 17/27); a 0/32 pair returns the current grain unchanged.
//
// y overlap: the first two bytes of each row are reused as per-row weights.
// Row 0 gives (top, current) = (27, 17) for the first overlapped line and
// row 1 gives (17, 27) for the second.
1406*c0909341SAndroid Build Coastguard Workerconst overlap_coeffs_0, align=4
1407*c0909341SAndroid Build Coastguard Worker        .byte 27, 17, 0,  0,  0,  0,  0,  0
1408*c0909341SAndroid Build Coastguard Worker        .byte 17, 27, 32, 32, 32, 32, 32, 32
1409*c0909341SAndroid Build Coastguard Workerendconst
1410*c0909341SAndroid Build Coastguard Worker
// Second set of overlap blend weights, laid out like overlap_coeffs_0 (two
// rows of eight bytes) but with a single blending column (23/22) followed by
// 0/32 pairs.
// NOTE(review): not referenced in the part of the file visible here;
// presumably used where the overlap is one sample wide (subsampled chroma).
// Confirm against the users of this symbol.
1411*c0909341SAndroid Build Coastguard Workerconst overlap_coeffs_1, align=4
1412*c0909341SAndroid Build Coastguard Worker        .byte 23, 0,  0,  0,  0,  0,  0,  0
1413*c0909341SAndroid Build Coastguard Worker        .byte 22, 32, 32, 32, 32, 32, 32, 32
1414*c0909341SAndroid Build Coastguard Workerendconst
1415*c0909341SAndroid Build Coastguard Worker
// calc_offset offx, offy, src, sx, sy
//
// Split a random value into a column and a row offset:
//     offy = src & 0xF          (doubled when sy == 0)
//     offx = src >> 4, logical  (doubled when sx == 0)
//
// sx and sy are assemble-time constants tested with .if.
// NOTE(review): presumably the horizontal/vertical subsampling flags;
// confirm against the fguv callers.
//
// Register aliasing: offy is written before src is read for the last time,
// so offy must not be the same register as src. offx may be the same
// register as src (the callers in this file rely on that).
// Flags are not modified.
1416*c0909341SAndroid Build Coastguard Worker.macro calc_offset offx, offy, src, sx, sy
1417*c0909341SAndroid Build Coastguard Worker        and             \offy, \src,  #0xF     // randval & 0xF
1418*c0909341SAndroid Build Coastguard Worker        lsr             \offx, \src,  #4       // randval >> 4
1419*c0909341SAndroid Build Coastguard Worker.if \sy == 0
1420*c0909341SAndroid Build Coastguard Worker        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
1421*c0909341SAndroid Build Coastguard Worker.endif
1422*c0909341SAndroid Build Coastguard Worker.if \sx == 0
1423*c0909341SAndroid Build Coastguard Worker        add             \offx, \offx, \offx    // 2 * (randval >> 4)
1424*c0909341SAndroid Build Coastguard Worker.endif
1425*c0909341SAndroid Build Coastguard Worker.endm
1426*c0909341SAndroid Build Coastguard Worker
// add_offset dst, offx, offy, src, stride
//
//     dst = src + stride * offy + offx
//
// Register aliasing: dst is written by the mla before offx is read, so dst
// must not be the same register as offx. dst may be the same register as
// offy or src (the callers in this file use both forms).
// Flags are not modified.
1427*c0909341SAndroid Build Coastguard Worker.macro add_offset dst, offx, offy, src, stride
1428*c0909341SAndroid Build Coastguard Worker        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
1429*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \offx          // grain_lut += offx
1430*c0909341SAndroid Build Coastguard Worker.endm
1431*c0909341SAndroid Build Coastguard Worker
1432*c0909341SAndroid Build Coastguard Worker// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
1433*c0909341SAndroid Build Coastguard Worker//                                const ptrdiff_t stride,
1434*c0909341SAndroid Build Coastguard Worker//                                const uint8_t scaling[SCALING_SIZE],
1435*c0909341SAndroid Build Coastguard Worker//                                const int scaling_shift,
1436*c0909341SAndroid Build Coastguard Worker//                                const entry grain_lut[][GRAIN_WIDTH],
1437*c0909341SAndroid Build Coastguard Worker//                                const int offsets[][2],
1438*c0909341SAndroid Build Coastguard Worker//                                const int h, const ptrdiff_t clip,
1439*c0909341SAndroid Build Coastguard Worker//                                const ptrdiff_t type);
//
// Setup half of the luma film grain application. It loads the arguments,
// derives the grain_lut row pointers and the constant vectors, then jumps
// (bx, not a call) into one of the four row loops in fgy_loop_neon. The
// selected loop restores the registers pushed here and returns to the caller.
//
// Arguments: r0 = dst, r1 = src, r2 = stride, r3 = scaling; the remaining
// ones are read from the stack. After the two pushes below (9 core registers
// = 36 bytes, q4-q7 = 64 bytes) the first stack argument is at [sp, #100].
//
// type selects the loop: it indexes the 4-entry table fgy_loop_tbl directly
// (entries are named loop_<ox><oy>), and bit 0 enables the y-overlap setup.
//
// State handed to the loop:
//   r0  dst                 r1  src               r2  stride
//   r3  scaling table       r9  GRAIN_WIDTH (grain_lut row stride)
//   r5  grain_lut pointer of the current block (offsets[0][0])
//   r4  grain_lut pointer of the "old" block   (offsets[1][0], +32 columns)
//   r6  grain_lut pointer of the "top" block   (offsets[0][1], +32 rows)
//   r8  grain_lut pointer of the "top old" block (offsets[1][1], +32 rows
//       and +32 columns)
//   r7  number of rows for the first loop (h, or 2 when y overlap is on)
//   r10 actual h (only meaningful when y overlap is on)
//   q12 (d24, d25) overlap_coeffs_0       q13 -scaling_shift in 16-bit lanes
//   q14 lower clamp bound                 q15 upper clamp bound
//   d14, d15 top / current weights for the first y-overlapped row
1440*c0909341SAndroid Build Coastguard Workerfunction fgy_32x32_8bpc_neon, export=1
1441*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
1442*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
1443*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
1444*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108]   // offsets, h
1445*c0909341SAndroid Build Coastguard Worker        ldr             r8,       [sp, #116]   // clip
1446*c0909341SAndroid Build Coastguard Worker        mov             r9,  #GRAIN_WIDTH      // grain_lut stride
1447*c0909341SAndroid Build Coastguard Worker
1448*c0909341SAndroid Build Coastguard Worker        neg             r4,  r4
1449*c0909341SAndroid Build Coastguard Worker        vdup.16         q13, r4                // -scaling_shift
1450*c0909341SAndroid Build Coastguard Worker        cmp             r8,  #0
1451*c0909341SAndroid Build Coastguard Worker
        // The flags from the cmp above are consumed by the beq below.
        // NOTE(review): this assumes movrel_local (defined elsewhere) leaves
        // the flags untouched - confirm.
1452*c0909341SAndroid Build Coastguard Worker        movrel_local    r12, overlap_coeffs_0
1453*c0909341SAndroid Build Coastguard Worker
1454*c0909341SAndroid Build Coastguard Worker        beq             1f
1455*c0909341SAndroid Build Coastguard Worker        // clip
1456*c0909341SAndroid Build Coastguard Worker        vmov.i8         q14, #16
1457*c0909341SAndroid Build Coastguard Worker        vmov.i8         q15, #235
1458*c0909341SAndroid Build Coastguard Worker        b               2f
1459*c0909341SAndroid Build Coastguard Worker1:
1460*c0909341SAndroid Build Coastguard Worker        // no clip
1461*c0909341SAndroid Build Coastguard Worker        vmov.i8         q14, #0
1462*c0909341SAndroid Build Coastguard Worker        vmov.i8         q15, #255
1463*c0909341SAndroid Build Coastguard Worker2:
1464*c0909341SAndroid Build Coastguard Worker
1465*c0909341SAndroid Build Coastguard Worker        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
1466*c0909341SAndroid Build Coastguard Worker
        // Skip 9 columns and 9 rows of grain_lut (8 * stride + stride).
1467*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  #9           // grain_lut += 9
1468*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
1469*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r9           // grain_lut += grain_stride
1470*c0909341SAndroid Build Coastguard Worker
        // Turn each of the four random offsets into a pointer relative to
        // r5. r5 itself is updated last because the other three use it as
        // their base.
1471*c0909341SAndroid Build Coastguard Worker        ldr             r10, [r6, #8]          // offsets[1][0]
1472*c0909341SAndroid Build Coastguard Worker        calc_offset     r10, r4,  r10, 0,   0
1473*c0909341SAndroid Build Coastguard Worker        add_offset      r4,  r10, r4,  r5,  r9
1474*c0909341SAndroid Build Coastguard Worker        ldr             r10, [r6, #4]          // offsets[0][1]
1475*c0909341SAndroid Build Coastguard Worker        calc_offset     r10, r11, r10, 0,   0
1476*c0909341SAndroid Build Coastguard Worker        add_offset      r11, r10, r11, r5,  r9
1477*c0909341SAndroid Build Coastguard Worker        ldr             r10, [r6, #12]         // offsets[1][1]
1478*c0909341SAndroid Build Coastguard Worker        calc_offset     r10, r8,  r10, 0,   0
1479*c0909341SAndroid Build Coastguard Worker        add_offset      r8,  r10, r8,  r5,  r9
1480*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [r6]              // offsets[0][0]
1481*c0909341SAndroid Build Coastguard Worker        calc_offset     r6,  lr,  r6,  0,   0
1482*c0909341SAndroid Build Coastguard Worker        add_offset      r5,  r6,  lr,  r5,  r9
1483*c0909341SAndroid Build Coastguard Worker
1484*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #32          // grain_lut += FG_BLOCK_SIZE * bx
1485*c0909341SAndroid Build Coastguard Worker        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1486*c0909341SAndroid Build Coastguard Worker
1487*c0909341SAndroid Build Coastguard Worker        ldr             r10, [sp, #120]        // type
1488*c0909341SAndroid Build Coastguard Worker        adr             r11, L(fgy_loop_tbl)
1489*c0909341SAndroid Build Coastguard Worker
        // The flags set by tst stay live until the beq further down: none
        // of the ldr/add instructions in between update them.
1490*c0909341SAndroid Build Coastguard Worker        tst             r10, #1
1491*c0909341SAndroid Build Coastguard Worker        ldr             r10, [r11, r10, lsl #2] // table entry = loop address - table address
1492*c0909341SAndroid Build Coastguard Worker
1493*c0909341SAndroid Build Coastguard Worker        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1494*c0909341SAndroid Build Coastguard Worker        add             r8,  r8,  #32          // grain_lut += FG_BLOCK_SIZE * bx
1495*c0909341SAndroid Build Coastguard Worker
1496*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r10          // r11 = address of the selected loop
1497*c0909341SAndroid Build Coastguard Worker
1498*c0909341SAndroid Build Coastguard Worker        beq             1f
1499*c0909341SAndroid Build Coastguard Worker        // y overlap
1500*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d24[0]            // weight of the top row, first overlapped line
1501*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d24[1]            // weight of the current row, first overlapped line
1502*c0909341SAndroid Build Coastguard Worker        mov             r10, r7                // backup actual h
1503*c0909341SAndroid Build Coastguard Worker        mov             r7,  #2                // the overlap loop runs for two rows only
1504*c0909341SAndroid Build Coastguard Worker1:
1505*c0909341SAndroid Build Coastguard Worker        bx              r11                    // continue in fgy_loop_neon; does not come back here
1506*c0909341SAndroid Build Coastguard Workerendfunc
1507*c0909341SAndroid Build Coastguard Worker
// fgy_loop_neon: the four row loops of the luma film grain application.
//
// This code is not entered at its first instruction: fgy_32x32_8bpc_neon
// reads an entry of fgy_loop_tbl, adds it to the table address and jumps
// there with bx. Each entry is the distance from the table to one loop plus
// CONFIG_THUMB.
// NOTE(review): CONFIG_THUMB is presumably 1 in Thumb builds so that bx
// stays in Thumb state, and 0 otherwise - confirm.
//
// Register state on entry is the one prepared by fgy_32x32_8bpc_neon:
//   r0 dst, r1 src, r2 stride, r3 scaling table, r9 grain_lut stride,
//   r5 / r4 / r6 / r8 grain_lut pointers (current / old / top / top old),
//   r7 row count, r10 actual h (y overlap only),
//   d24-d25 overlap coefficients, d14-d15 y-overlap weights,
//   q13 -scaling_shift, q14 / q15 lower / upper clamp bounds.
//
// Every loop ends with the vpop/pop pair matching the pushes done in
// fgy_32x32_8bpc_neon, so the final pop returns to the caller of that function.
1508*c0909341SAndroid Build Coastguard Workerfunction fgy_loop_neon
1509*c0909341SAndroid Build Coastguard WorkerL(fgy_loop_tbl):
1510*c0909341SAndroid Build Coastguard Worker        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB   // type 0: no overlap
1511*c0909341SAndroid Build Coastguard Worker        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB   // type 1: y overlap
1512*c0909341SAndroid Build Coastguard Worker        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB   // type 2: x overlap
1513*c0909341SAndroid Build Coastguard Worker        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB   // type 3: x and y overlap
1514*c0909341SAndroid Build Coastguard Worker
// fgy ox, oy: emit the loop labelled loop_<ox><oy>. Each iteration handles
// one row of 32 pixels:
//   1. load the grain row (plus the old / top / top-old rows as needed),
//   2. ox: blend the first 8 grain values with the old block,
//   3. oy: blend the whole row with the top row,
//   4. look up scaling[src[x]] with gather32_neon,
//   5. noise = round2(scaling * grain, scaling_shift),
//   6. dst = clamp(src + noise, q14, q15).
1515*c0909341SAndroid Build Coastguard Worker.macro fgy ox, oy
1516*c0909341SAndroid Build Coastguard WorkerL(loop_\ox\oy):
1517*c0909341SAndroid Build Coastguard Worker1:
1518*c0909341SAndroid Build Coastguard Worker.if \ox
1519*c0909341SAndroid Build Coastguard Worker        vld1.8          {d8},       [r4],       r9 // grain_lut old
1520*c0909341SAndroid Build Coastguard Worker.endif
1521*c0909341SAndroid Build Coastguard Worker.if \oy
1522*c0909341SAndroid Build Coastguard Worker        vld1.8          {q2, q3},   [r6],       r9 // grain_lut top
1523*c0909341SAndroid Build Coastguard Worker.endif
1524*c0909341SAndroid Build Coastguard Worker.if \ox && \oy
1525*c0909341SAndroid Build Coastguard Worker        vld1.8          {d10},      [r8],       r9 // grain_lut top old
1526*c0909341SAndroid Build Coastguard Worker.endif
1527*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0,  q1},  [r1, :128], r2 // src
1528*c0909341SAndroid Build Coastguard Worker        vld1.8          {q10, q11}, [r5],       r9 // grain_lut
1529*c0909341SAndroid Build Coastguard Worker
1530*c0909341SAndroid Build Coastguard Worker.if \ox
        // x overlap on the first 8 values: q4 = old * d24 + cur * d25
1531*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d8,  d24
1532*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d20, d25
1533*c0909341SAndroid Build Coastguard Worker.endif
1534*c0909341SAndroid Build Coastguard Worker
1535*c0909341SAndroid Build Coastguard Worker.if \oy
1536*c0909341SAndroid Build Coastguard Worker.if \ox
        // Apply the same x overlap to the top row (top old * d24 + top * d25),
        // then narrow both blends with rounding (>> 5) and saturation.
1537*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d10, d24
1538*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d4,  d25
1539*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1540*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d4,  q5,  #5
1541*c0909341SAndroid Build Coastguard Worker.endif
1542*c0909341SAndroid Build Coastguard Worker
        // y overlap on all 32 values: cur * d15 + top * d14, rounded >> 5
1543*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d20, d15
1544*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d21, d15
1545*c0909341SAndroid Build Coastguard Worker        vmull.s8        q8,  d22, d15
1546*c0909341SAndroid Build Coastguard Worker        vmull.s8        q9,  d23, d15
1547*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d4,  d14
1548*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d5,  d14
1549*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q8,  d6,  d14
1550*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q9,  d7,  d14
1551*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1552*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d21, q5,  #5
1553*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d22, q8,  #5
1554*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d23, q9,  #5
1555*c0909341SAndroid Build Coastguard Worker.elseif \ox
        // x overlap only: narrow the blend back into the grain row
1556*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1557*c0909341SAndroid Build Coastguard Worker.endif
1558*c0909341SAndroid Build Coastguard Worker
        // q4-q5 = scaling[src[x]] for the 32 source pixels in q0-q1 (r3 is
        // the table base). This overwrites the blend temporaries in q4/q5.
1559*c0909341SAndroid Build Coastguard Worker        bl              gather32_neon
1560*c0909341SAndroid Build Coastguard Worker
1561*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d20       // grain
1562*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q9,  d21
1563*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q10, d22
1564*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q11, d23
1565*c0909341SAndroid Build Coastguard Worker
1566*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q2,  d8        // scaling
1567*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q3,  d9
1568*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q4,  d10
1569*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q5,  d11
1570*c0909341SAndroid Build Coastguard Worker
1571*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q2   // scaling * grain
1572*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q3
1573*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q10, q4
1574*c0909341SAndroid Build Coastguard Worker        vmul.i16        q11, q11, q5
1575*c0909341SAndroid Build Coastguard Worker
        // q13 holds -scaling_shift, so the rounding left shift by q13 is a
        // rounding right shift by scaling_shift.
1576*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
1577*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q9,  q9,  q13
1578*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q10, q10, q13
1579*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q11, q11, q13
1580*c0909341SAndroid Build Coastguard Worker
1581*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q8,  q8,  d0   // *src + noise
1582*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q9,  q9,  d1
1583*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q10, q10, d2
1584*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q11, q11, d3
1585*c0909341SAndroid Build Coastguard Worker
1586*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d0,  q8        // saturate to unsigned 8 bit
1587*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d1,  q9
1588*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d2,  q10
1589*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d3,  q11
1590*c0909341SAndroid Build Coastguard Worker
1591*c0909341SAndroid Build Coastguard Worker        vmax.u8         q0,  q0,  q14  // clamp to [q14, q15]
1592*c0909341SAndroid Build Coastguard Worker        vmax.u8         q1,  q1,  q14
1593*c0909341SAndroid Build Coastguard Worker        vmin.u8         q0,  q0,  q15
1594*c0909341SAndroid Build Coastguard Worker        vmin.u8         q1,  q1,  q15
1595*c0909341SAndroid Build Coastguard Worker
        // The flags from this subs are consumed by the bgt below; the NEON
        // instructions in between do not change them.
1596*c0909341SAndroid Build Coastguard Worker        subs            r7,  r7,  #1
1597*c0909341SAndroid Build Coastguard Worker.if \oy
        // Switch to the weights of the second overlapped row (second row of
        // the coefficient table).
1598*c0909341SAndroid Build Coastguard Worker        vdup.8          d14, d25[0]
1599*c0909341SAndroid Build Coastguard Worker        vdup.8          d15, d25[1]
1600*c0909341SAndroid Build Coastguard Worker.endif
1601*c0909341SAndroid Build Coastguard Worker        vst1.8          {q0, q1}, [r0, :128], r2 // dst
1602*c0909341SAndroid Build Coastguard Worker        bgt             1b
1603*c0909341SAndroid Build Coastguard Worker
1604*c0909341SAndroid Build Coastguard Worker.if \oy
        // The two overlapped rows are done. If the block is taller than 2
        // rows, continue in the loop without y overlap (same ox) for the
        // remaining h - 2 rows. sub does not set flags, so bgt still tests
        // the result of the cmp.
1605*c0909341SAndroid Build Coastguard Worker        cmp             r10, #2
1606*c0909341SAndroid Build Coastguard Worker        sub             r7,  r10, #2           // restore actual remaining h
1607*c0909341SAndroid Build Coastguard Worker        bgt             L(loop_\ox\()0)
1608*c0909341SAndroid Build Coastguard Worker.endif
        // Undo the pushes done in fgy_32x32_8bpc_neon and return to its caller.
1609*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1610*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1611*c0909341SAndroid Build Coastguard Worker.endm
1612*c0909341SAndroid Build Coastguard Worker
1613*c0909341SAndroid Build Coastguard Worker        fgy             0, 0
1614*c0909341SAndroid Build Coastguard Worker        fgy             0, 1
1615*c0909341SAndroid Build Coastguard Worker        fgy             1, 0
1616*c0909341SAndroid Build Coastguard Worker        fgy             1, 1
1617*c0909341SAndroid Build Coastguard Workerendfunc
1618*c0909341SAndroid Build Coastguard Worker
1619*c0909341SAndroid Build Coastguard Worker// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
1620*c0909341SAndroid Build Coastguard Worker//                                     const pixel *const src,
1621*c0909341SAndroid Build Coastguard Worker//                                     const ptrdiff_t stride,
1622*c0909341SAndroid Build Coastguard Worker//                                     const uint8_t scaling[SCALING_SIZE],
1623*c0909341SAndroid Build Coastguard Worker//                                     const Dav1dFilmGrainData *const data,
1624*c0909341SAndroid Build Coastguard Worker//                                     const entry grain_lut[][GRAIN_WIDTH],
1625*c0909341SAndroid Build Coastguard Worker//                                     const pixel *const luma_row,
1626*c0909341SAndroid Build Coastguard Worker//                                     const ptrdiff_t luma_stride,
1627*c0909341SAndroid Build Coastguard Worker//                                     const int offsets[][2],
1628*c0909341SAndroid Build Coastguard Worker//                                     const ptrdiff_t h, const ptrdiff_t uv,
1629*c0909341SAndroid Build Coastguard Worker//                                     const ptrdiff_t is_id,
1630*c0909341SAndroid Build Coastguard Worker//                                     const ptrdiff_t type);
// fguv layout, sx, sy: expands to the exported entry point
// fguv_32x32_<layout>_8bpc_neon.
//   layout - suffix pasted into the symbol name
//   sx, sy - 1 if chroma is subsampled horizontally / vertically, else 0
// The entry point loads the stack arguments, sets up the constants that the
// loop bodies keep in NEON registers, derives four grain_lut pointers and then
// jumps (bx, it does not return here) into one variant of
// fguv_loop_sx<sx>_neon. That loop code also executes the vpop/pop which
// undoes the push/vpush below and returns to the caller.
// State handed to the loop variant (r0-r3 are not named by any instruction
// in this macro):
//   r5  grain_lut for offsets[0][0] (current block)
//   r4  grain_lut for offsets[1][0], advanced by 32 >> sx bytes
//   r8  grain_lut for offsets[0][1], advanced by (32 >> sy) grain rows
//   r11 grain_lut for offsets[1][1], advanced in both directions
//   r6  luma_row, r7 luma_stride (doubled when sy), r10 grain_lut stride
//   r9  rows to process, lr rows left after the y overlap rows (only written
//       when bit 0 of type is set)
//   d4  16 bit lanes {uv_luma_mult, uv_mult, uv_offset, uv_luma_mult}
//   d6/d7 vertical blend weights, q12 overlap_coeffs, q13 -scaling_shift
//   q14/q15 lower/upper clamp bytes
1631*c0909341SAndroid Build Coastguard Worker.macro fguv layout, sx, sy
1632*c0909341SAndroid Build Coastguard Workerfunction fguv_32x32_\layout\()_8bpc_neon, export=1
        // push stores 9 words (36 bytes) and vpush 4 q registers (64 bytes),
        // which is why the first stack argument is read from sp + 100.
1633*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
1634*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
1635*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
1636*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride
1637*c0909341SAndroid Build Coastguard Worker        ldrd            r8,  r9,  [sp, #116]   // offsets, h
1638*c0909341SAndroid Build Coastguard Worker        ldrd            r10, r11, [sp, #124]   // uv, is_id
1639*c0909341SAndroid Build Coastguard Worker
        // Parameters used by the loop variants that are built with csfl == 0.
        // r10 = data + 4*uv is the common base; each FGD_UV_* offset is added
        // to it, so entries are 4 bytes apart and only their first halfword
        // is loaded. The first load fills all four lanes of d4, the next two
        // overwrite lanes 2 and 1.
1640*c0909341SAndroid Build Coastguard Worker        // !csfl
1641*c0909341SAndroid Build Coastguard Worker        add             r10, r4,  r10, lsl #2  // + 4*uv
1642*c0909341SAndroid Build Coastguard Worker        add             r12, r10, #FGD_UV_LUMA_MULT
1643*c0909341SAndroid Build Coastguard Worker        add             lr,  r10, #FGD_UV_MULT
1644*c0909341SAndroid Build Coastguard Worker        add             r10, r10, #FGD_UV_OFFSET
1645*c0909341SAndroid Build Coastguard Worker        vld1.16         {d4[]},  [r12]         // uv_luma_mult
1646*c0909341SAndroid Build Coastguard Worker        vld1.16         {d4[2]}, [r10]         // uv_offset
1647*c0909341SAndroid Build Coastguard Worker        vld1.16         {d4[1]}, [lr]          // uv_mult
1648*c0909341SAndroid Build Coastguard Worker
1649*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
1650*c0909341SAndroid Build Coastguard Worker        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
1651*c0909341SAndroid Build Coastguard Worker        neg             lr,  lr                // -scaling_shift
1652*c0909341SAndroid Build Coastguard Worker
        // The negated shift lets the loops use vrshl (rounding shift left by a
        // negative amount) as a rounding right shift by scaling_shift.
1653*c0909341SAndroid Build Coastguard Worker        cmp             r12, #0
1654*c0909341SAndroid Build Coastguard Worker        vdup.16         q13, lr                // -scaling_shift
1655*c0909341SAndroid Build Coastguard Worker
        // q14 = lower bound, q15 = upper bound; the loops apply them with
        // vmax.u8 / vmin.u8. Clip: 16..240, or 16..235 when is_id is nonzero.
        // No clip: 0..255.
1656*c0909341SAndroid Build Coastguard Worker        beq             1f
1657*c0909341SAndroid Build Coastguard Worker        // clip
1658*c0909341SAndroid Build Coastguard Worker        cmp             r11, #0
1659*c0909341SAndroid Build Coastguard Worker        vmov.i8         q14, #16
1660*c0909341SAndroid Build Coastguard Worker        vmov.i8         q15, #240
1661*c0909341SAndroid Build Coastguard Worker        beq             2f
1662*c0909341SAndroid Build Coastguard Worker        // is_id
1663*c0909341SAndroid Build Coastguard Worker        vmov.i8         q15, #235
1664*c0909341SAndroid Build Coastguard Worker        b               2f
1665*c0909341SAndroid Build Coastguard Worker1:
1666*c0909341SAndroid Build Coastguard Worker        // no clip
1667*c0909341SAndroid Build Coastguard Worker        vmov.i8         q14, #0
1668*c0909341SAndroid Build Coastguard Worker        vmov.i8         q15, #255
1669*c0909341SAndroid Build Coastguard Worker2:
1670*c0909341SAndroid Build Coastguard Worker
1671*c0909341SAndroid Build Coastguard Worker        mov             r10, #GRAIN_WIDTH      // grain_lut stride
1672*c0909341SAndroid Build Coastguard Worker
        // Skip the fixed border of the grain LUT: 9 columns (sx == 0) or 6
        // columns (sx == 1), and 6 rows (sy == 1) or 9 rows (sy == 0).
1673*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
1674*c0909341SAndroid Build Coastguard Worker.if \sy
1675*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
1676*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
1677*c0909341SAndroid Build Coastguard Worker.else
1678*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
1679*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r10          // grain_lut += grain_stride
1680*c0909341SAndroid Build Coastguard Worker.endif
1681*c0909341SAndroid Build Coastguard Worker
        // One grain_lut pointer per offsets[][] entry. r4 (data) and r11
        // (is_id) are no longer needed and are reused as pointer registers.
        // calc_offset and add_offset are macros defined outside this chunk.
        // NOTE(review): their operand roles are inferred from the comments
        // here - confirm against the macro definitions.
        // offsets[0][0] is handled last because it overwrites r8 and r5,
        // which the three earlier computations still read.
1682*c0909341SAndroid Build Coastguard Worker        ldr             r12, [r8, #8]          // offsets[1][0]
1683*c0909341SAndroid Build Coastguard Worker        calc_offset     r12, r4,  r12, \sx, \sy
1684*c0909341SAndroid Build Coastguard Worker        add_offset      r4,  r12, r4,  r5,  r10
1685*c0909341SAndroid Build Coastguard Worker
1686*c0909341SAndroid Build Coastguard Worker        ldr             r12, [r8, #4]          // offsets[0][1]
1687*c0909341SAndroid Build Coastguard Worker        calc_offset     r12, lr,  r12, \sx, \sy
1688*c0909341SAndroid Build Coastguard Worker        add_offset      lr,  r12, lr,  r5,  r10
1689*c0909341SAndroid Build Coastguard Worker
1690*c0909341SAndroid Build Coastguard Worker        ldr             r12, [r8, #12]         // offsets[1][1]
1691*c0909341SAndroid Build Coastguard Worker        calc_offset     r12, r11, r12, \sx, \sy
1692*c0909341SAndroid Build Coastguard Worker        add_offset      r11, r12, r11, r5,  r10
1693*c0909341SAndroid Build Coastguard Worker
1694*c0909341SAndroid Build Coastguard Worker        ldr             r8,  [r8]              // offsets[0][0]
1695*c0909341SAndroid Build Coastguard Worker        calc_offset     r8,  r12, r8,  \sx, \sy
1696*c0909341SAndroid Build Coastguard Worker        add_offset      r5,  r8,  r12, r5,  r10
1697*c0909341SAndroid Build Coastguard Worker
        // Advance the neighbour pointers by one block: 32 >> sx columns for
        // the horizontal neighbour, 32 >> sy rows (stride << (5 - sy)) for the
        // vertical one, and both for the diagonal one. r8 takes over the
        // offsets[0][1] pointer from lr, which is reloaded with type below.
1698*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
1699*c0909341SAndroid Build Coastguard Worker        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1700*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1701*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
1702*c0909341SAndroid Build Coastguard Worker
1703*c0909341SAndroid Build Coastguard Worker        movrel_local    r12, overlap_coeffs_\sx
1704*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [sp, #132]        // type
1705*c0909341SAndroid Build Coastguard Worker
1706*c0909341SAndroid Build Coastguard Worker        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
1707*c0909341SAndroid Build Coastguard Worker
1708*c0909341SAndroid Build Coastguard Worker        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
1709*c0909341SAndroid Build Coastguard Worker#if CONFIG_THUMB
1710*c0909341SAndroid Build Coastguard Worker        // This uses movrel_local instead of adr above, because the target
1711*c0909341SAndroid Build Coastguard Worker        // can be out of range for adr. But movrel_local leaves the thumb bit
1712*c0909341SAndroid Build Coastguard Worker        // set on COFF (but probably wouldn't if building for thumb on ELF),
1713*c0909341SAndroid Build Coastguard Worker        // thus try to clear the bit for robustness.
1714*c0909341SAndroid Build Coastguard Worker        bic             r12, r12, #1
1715*c0909341SAndroid Build Coastguard Worker#endif
1716*c0909341SAndroid Build Coastguard Worker
        // type indexes the 8 entry table of the loop function. The entries
        // are ordered csfl0_00 .. csfl1_11 and the labels are named
        // csfl<csfl>_<ox><oy>, so bit 0 = oy, bit 1 = ox, bit 2 = csfl.
        // tst records bit 0 (y overlap) in the Z flag. The ldr and add that
        // follow are the non flag setting forms, so the beq further down
        // still sees the result of this tst.
1717*c0909341SAndroid Build Coastguard Worker        tst             lr,  #1
1718*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [r12, lr,  lsl #2]
1719*c0909341SAndroid Build Coastguard Worker
        // Table entries are offsets relative to the table label (with
        // CONFIG_THUMB added in), so adding the base yields the bx target.
1720*c0909341SAndroid Build Coastguard Worker        add             r12, r12, lr
1721*c0909341SAndroid Build Coastguard Worker
        // With y overlap only the first 2 >> sy rows are blended with the
        // block above. r9 is set to that row count and lr keeps the rest of
        // h; the oy loop variants copy lr to r12 on entry and continue with
        // the variant without y overlap once the blended rows are done.
1722*c0909341SAndroid Build Coastguard Worker        beq             1f
1723*c0909341SAndroid Build Coastguard Worker        // y overlap
1724*c0909341SAndroid Build Coastguard Worker        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
1725*c0909341SAndroid Build Coastguard Worker        mov             r9,  #(2 >> \sy)
1726*c0909341SAndroid Build Coastguard Worker
1727*c0909341SAndroid Build Coastguard Worker1:
1728*c0909341SAndroid Build Coastguard Worker
        // Vertical blend weights for the first overlapped row: in the loops
        // d6 multiplies the grain row from the block above and d7 the
        // current grain row.
1729*c0909341SAndroid Build Coastguard Worker.if \sy
1730*c0909341SAndroid Build Coastguard Worker        vmov.i8         d6,  #23
1731*c0909341SAndroid Build Coastguard Worker        vmov.i8         d7,  #22
1732*c0909341SAndroid Build Coastguard Worker.else
1733*c0909341SAndroid Build Coastguard Worker        vmov.i8         d6,  #27
1734*c0909341SAndroid Build Coastguard Worker        vmov.i8         d7,  #17
1735*c0909341SAndroid Build Coastguard Worker.endif
1736*c0909341SAndroid Build Coastguard Worker
        // The loops post-increment the luma pointer by r7 once per output
        // row, so a doubled stride steps two luma rows per chroma row.
1737*c0909341SAndroid Build Coastguard Worker.if \sy
1738*c0909341SAndroid Build Coastguard Worker        add             r7,  r7,  r7           // luma_stride *= 2
1739*c0909341SAndroid Build Coastguard Worker.endif
1740*c0909341SAndroid Build Coastguard Worker
        // Tail jump into the selected loop variant; it returns to our caller.
1741*c0909341SAndroid Build Coastguard Worker        bx              r12
1742*c0909341SAndroid Build Coastguard Workerendfunc
1743*c0909341SAndroid Build Coastguard Worker.endm
1744*c0909341SAndroid Build Coastguard Worker
// Emit one entry point per layout argument (layout, sx, sy):
//   420 - sx = 1, sy = 1, dispatches into fguv_loop_sx1_neon
//   422 - sx = 1, sy = 0, dispatches into fguv_loop_sx1_neon
//   444 - sx = 0, sy = 0, dispatches into fguv_loop_sx0_neon
1745*c0909341SAndroid Build Coastguard Workerfguv 420, 1, 1
1746*c0909341SAndroid Build Coastguard Workerfguv 422, 1, 0
1747*c0909341SAndroid Build Coastguard Workerfguv 444, 0, 0
1748*c0909341SAndroid Build Coastguard Worker
// fguv_loop_sx0_neon: row loops for sx == 0, processing 32 pixels per row.
// It is never called at its start; the fguv_32x32_* entry point that was
// built with sx = 0 jumps to one of the eight labels listed in the table
// below. Table index = type, with bit 0 = oy (overlap with the block above),
// bit 1 = ox (overlap with the block to the left) and bit 2 = csfl.
// NOTE(review): csfl presumably stands for chroma scaling from luma - confirm.
// Expected on entry (prepared by the fguv macro unless noted):
//   r0 dst, r1 src, r2 stride used for both (caller arguments, see the
//   comments on the loads and the store below)
//   r4 grain_lut old, r5 grain_lut, r8 grain_lut top, r11 grain_lut top old
//   r6 luma, r7 luma stride, r9 row count, r10 grain_lut stride
//   lr remaining rows after the overlap rows (oy variants only)
//   d4 uv_luma_mult / uv_mult / uv_offset in lanes 0 / 1 / 2
//   d6, d7 vertical weights, q12 overlap_coeffs, q13 -scaling_shift
//   q14, q15 lower / upper clamp
1749*c0909341SAndroid Build Coastguard Workerfunction fguv_loop_sx0_neon
        // Each entry is the distance from the table label to a loop variant;
        // CONFIG_THUMB is added so that the bx target has the thumb bit set
        // in thumb builds.
1750*c0909341SAndroid Build Coastguard WorkerL(fguv_loop_sx0_tbl):
1751*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1752*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1753*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1754*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1755*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1756*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1757*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1758*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
1759*c0909341SAndroid Build Coastguard Worker
// fguv_loop_sx0 csfl, ox, oy: one specialised copy of the row loop. Each of
// the three arguments is 0 or 1 and only selects which parts are assembled.
1760*c0909341SAndroid Build Coastguard Worker.macro fguv_loop_sx0 csfl, ox, oy
1761*c0909341SAndroid Build Coastguard WorkerL(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        // The bl inside the loop overwrites lr, so the number of rows that
        // remain after the overlap rows is parked in r12.
1762*c0909341SAndroid Build Coastguard Worker.if \oy
1763*c0909341SAndroid Build Coastguard Worker        mov             r12, lr
1764*c0909341SAndroid Build Coastguard Worker.endif
1765*c0909341SAndroid Build Coastguard Worker1:
1766*c0909341SAndroid Build Coastguard Worker.if \ox
1767*c0909341SAndroid Build Coastguard Worker        vld1.8          {d8},       [r4],        r10 // grain_lut old
1768*c0909341SAndroid Build Coastguard Worker.endif
1769*c0909341SAndroid Build Coastguard Worker.if \oy
1770*c0909341SAndroid Build Coastguard Worker        vld1.8          {q8, q9},   [r8],        r10 // grain_lut top
1771*c0909341SAndroid Build Coastguard Worker.endif
1772*c0909341SAndroid Build Coastguard Worker.if \ox && \oy
1773*c0909341SAndroid Build Coastguard Worker        vld1.8          {d10},      [r11],       r10 // grain_lut top old
1774*c0909341SAndroid Build Coastguard Worker.endif
1775*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
1776*c0909341SAndroid Build Coastguard Worker        vld1.8          {q10, q11}, [r5],        r10 // grain_lut
1777*c0909341SAndroid Build Coastguard Worker
        // Horizontal overlap on the first 8 grain bytes of the row:
        // q4 = old * d24 + cur * d25 (16 bit); narrowed back to d20 below.
        // The overlap_coeffs table contents are outside this chunk.
1778*c0909341SAndroid Build Coastguard Worker.if \ox
1779*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d8,  d24
1780*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d20, d25
1781*c0909341SAndroid Build Coastguard Worker.endif
1782*c0909341SAndroid Build Coastguard Worker
1783*c0909341SAndroid Build Coastguard Worker.if \oy
        // With both overlaps, the top row gets the same horizontal blend
        // (top old * d24 + top * d25) and both rows are narrowed with a
        // rounding, saturating shift right by 5 before the vertical blend.
1784*c0909341SAndroid Build Coastguard Worker.if \ox
1785*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d10, d24
1786*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d16, d25
1787*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1788*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d16, q5,  #5
1789*c0909341SAndroid Build Coastguard Worker.endif
1790*c0909341SAndroid Build Coastguard Worker
        // Vertical overlap over all 32 columns:
        // grain = sat_s8(round((cur * d7 + top * d6) >> 5))
1791*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d20, d7
1792*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d21, d7
1793*c0909341SAndroid Build Coastguard Worker        vmull.s8        q6,  d22, d7
1794*c0909341SAndroid Build Coastguard Worker        vmull.s8        q7,  d23, d7
1795*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d16, d6
1796*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d17, d6
1797*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q6,  d18, d6
1798*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q7,  d19, d6
1799*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1800*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d21, q5,  #5
1801*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d22, q6,  #5
1802*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d23, q7,  #5
1803*c0909341SAndroid Build Coastguard Worker.elseif \ox
1804*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1805*c0909341SAndroid Build Coastguard Worker.endif
        // csfl == 0: the value handed to the gather in q0/q1 is not the raw
        // luma but
        //   sat_u8(((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset)
        // using a saturating 16 bit add and an arithmetic shift. src is only
        // peeked at here (no post-increment); it is loaded again, with
        // post-increment, after the gather because q0/q1 and q8/q9 are
        // reused in between.
        // csfl == 1: the luma bytes in q0/q1 are passed on unchanged.
1806*c0909341SAndroid Build Coastguard Worker.if !\csfl
1807*c0909341SAndroid Build Coastguard Worker        vld1.8          {q8,  q9},  [r1, :128] // src
1808*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q4,  d0
1809*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q5,  d1
1810*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q6,  d2
1811*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q7,  d3
1812*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q0,  d16
1813*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q1,  d17
1814*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q8,  d18
1815*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q9,  d19
1816*c0909341SAndroid Build Coastguard Worker        vmul.i16        q4,  q4,  d4[0]
1817*c0909341SAndroid Build Coastguard Worker        vmul.i16        q5,  q5,  d4[0]
1818*c0909341SAndroid Build Coastguard Worker        vmul.i16        q6,  q6,  d4[0]
1819*c0909341SAndroid Build Coastguard Worker        vmul.i16        q7,  q7,  d4[0]
1820*c0909341SAndroid Build Coastguard Worker        vmul.i16        q0,  q0,  d4[1]
1821*c0909341SAndroid Build Coastguard Worker        vmul.i16        q1,  q1,  d4[1]
1822*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  d4[1]
1823*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  d4[1]
1824*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q4,  q4,  q0
1825*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q5,  q5,  q1
1826*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q6,  q6,  q8
1827*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q7,  q7,  q9
1828*c0909341SAndroid Build Coastguard Worker        vdup.16         q0,  d4[2]
1829*c0909341SAndroid Build Coastguard Worker        vshr.s16        q4,  q4,  #6
1830*c0909341SAndroid Build Coastguard Worker        vshr.s16        q5,  q5,  #6
1831*c0909341SAndroid Build Coastguard Worker        vshr.s16        q6,  q6,  #6
1832*c0909341SAndroid Build Coastguard Worker        vshr.s16        q7,  q7,  #6
1833*c0909341SAndroid Build Coastguard Worker        vadd.i16        q4,  q4,  q0
1834*c0909341SAndroid Build Coastguard Worker        vadd.i16        q5,  q5,  q0
1835*c0909341SAndroid Build Coastguard Worker        vadd.i16        q6,  q6,  q0
1836*c0909341SAndroid Build Coastguard Worker        vadd.i16        q7,  q7,  q0
1837*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d0,  q4
1838*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d1,  q5
1839*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d2,  q6
1840*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d3,  q7
1841*c0909341SAndroid Build Coastguard Worker.endif
1842*c0909341SAndroid Build Coastguard Worker
        // gather32_neon is defined outside this chunk.
        // NOTE(review): judging by the code around the call it takes 32 index
        // bytes in q0/q1 and returns 32 scaling bytes in q4/q5 (d8-d11),
        // leaving the grain in q10/q11 and the loop registers intact -
        // confirm against its definition.
1843*c0909341SAndroid Build Coastguard Worker        bl              gather32_neon
1844*c0909341SAndroid Build Coastguard Worker
1845*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0,  q1},  [r1, :128], r2 // src
1846*c0909341SAndroid Build Coastguard Worker
1847*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d20       // grain
1848*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q9,  d21
1849*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q10, d22
1850*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q11, d23
1851*c0909341SAndroid Build Coastguard Worker
        // d8/d9 are widened into q6/q7 first so that q4/q5 can then be
        // overwritten with the widened d10/d11.
1852*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q6,  d8        // scaling
1853*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q7,  d9
1854*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q4,  d10
1855*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q5,  d11
1856*c0909341SAndroid Build Coastguard Worker
1857*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q6   // scaling * grain
1858*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q7
1859*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q10, q4
1860*c0909341SAndroid Build Coastguard Worker        vmul.i16        q11, q11, q5
1861*c0909341SAndroid Build Coastguard Worker
1862*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
1863*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q9,  q9,  q13
1864*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q10, q10, q13
1865*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q11, q11, q13
1866*c0909341SAndroid Build Coastguard Worker
1867*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q8,  q8,  d0   // *src + noise
1868*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q9,  q9,  d1
1869*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q10, q10, d2
1870*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q11, q11, d3
1871*c0909341SAndroid Build Coastguard Worker
        // Saturate to 0..255, then clamp to the range held in q14..q15.
1872*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d0,  q8
1873*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d1,  q9
1874*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d2,  q10
1875*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d3,  q11
1876*c0909341SAndroid Build Coastguard Worker
1877*c0909341SAndroid Build Coastguard Worker        vmax.u8         q0,  q0,  q14
1878*c0909341SAndroid Build Coastguard Worker        vmax.u8         q1,  q1,  q14
1879*c0909341SAndroid Build Coastguard Worker        vmin.u8         q0,  q0,  q15
1880*c0909341SAndroid Build Coastguard Worker        vmin.u8         q1,  q1,  q15
1881*c0909341SAndroid Build Coastguard Worker
        // The flags set by subs are consumed by the bgt after the store; the
        // NEON instructions in between leave the condition flags alone.
1882*c0909341SAndroid Build Coastguard Worker        subs            r9,  r9,  #1
        // Weights for the second overlapped row are taken from bytes 0 and 1
        // of d25, the second half of overlap_coeffs. The sx == 0 loops are
        // only reached with sy == 0, where two rows are overlapped.
1883*c0909341SAndroid Build Coastguard Worker.if \oy
1884*c0909341SAndroid Build Coastguard Worker        vdup.8          d6,  d25[0]
1885*c0909341SAndroid Build Coastguard Worker        vdup.8          d7,  d25[1]
1886*c0909341SAndroid Build Coastguard Worker.endif
1887*c0909341SAndroid Build Coastguard Worker
1888*c0909341SAndroid Build Coastguard Worker        vst1.8          {q0, q1}, [r0, :128], r2 // dst
1889*c0909341SAndroid Build Coastguard Worker        bgt             1b
1890*c0909341SAndroid Build Coastguard Worker
        // Overlap rows are done: if rows remain, carry on in the variant
        // with the same csfl and ox but without y overlap.
1891*c0909341SAndroid Build Coastguard Worker.if \oy
1892*c0909341SAndroid Build Coastguard Worker        cmp             r12, #0
1893*c0909341SAndroid Build Coastguard Worker        mov             r9,  r12               // restore actual remaining h
1894*c0909341SAndroid Build Coastguard Worker        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
1895*c0909341SAndroid Build Coastguard Worker.endif
1896*c0909341SAndroid Build Coastguard Worker        b               9f
1897*c0909341SAndroid Build Coastguard Worker.endm
        // Expand all eight variants; the argument order is csfl, ox, oy.
1898*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   0, 0, 0
1899*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   0, 0, 1
1900*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   0, 1, 0
1901*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   0, 1, 1
1902*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   1, 0, 0
1903*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   1, 0, 1
1904*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   1, 1, 0
1905*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx0   1, 1, 1
1906*c0909341SAndroid Build Coastguard Worker
        // Shared exit for all variants: restores what the fguv_32x32_* entry
        // point pushed and returns to that entry point's caller.
1907*c0909341SAndroid Build Coastguard Worker9:
1908*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
1909*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1910*c0909341SAndroid Build Coastguard Workerendfunc
1911*c0909341SAndroid Build Coastguard Worker
// fguv_loop_sx1_neon: row loops for sx == 1, processing 16 chroma pixels per
// row from 32 luma pixels. It is never called at its start; the
// fguv_32x32_* entry points built with sx = 1 (sy may be 0 or 1) jump to one
// of the eight labels listed in the table below. Table index = type, with
// bit 0 = oy (overlap with the block above), bit 1 = ox (overlap with the
// block to the left) and bit 2 = csfl.
// NOTE(review): csfl presumably stands for chroma scaling from luma - confirm.
// Expected on entry (prepared by the fguv macro unless noted):
//   r0 dst, r1 src, r2 stride used for both (caller arguments, see the
//   comments on the loads and the store below)
//   r4 grain_lut old, r5 grain_lut, r8 grain_lut top, r11 grain_lut top old
//   r6 luma, r7 luma stride (already doubled when sy), r9 row count
//   r10 grain_lut stride, lr remaining rows after the overlap rows (oy only)
//   d4 uv_luma_mult / uv_mult / uv_offset in lanes 0 / 1 / 2
//   d6, d7 vertical weights, q12 overlap_coeffs, q13 -scaling_shift
//   q14, q15 lower / upper clamp
1912*c0909341SAndroid Build Coastguard Workerfunction fguv_loop_sx1_neon
        // Each entry is the distance from the table label to a loop variant;
        // CONFIG_THUMB is added so that the bx target has the thumb bit set
        // in thumb builds.
1913*c0909341SAndroid Build Coastguard WorkerL(fguv_loop_sx1_tbl):
1914*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1915*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1916*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1917*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1918*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1919*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1920*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1921*c0909341SAndroid Build Coastguard Worker        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
1922*c0909341SAndroid Build Coastguard Worker
// fguv_loop_sx1 csfl, ox, oy: one specialised copy of the row loop. Each of
// the three arguments is 0 or 1 and only selects which parts are assembled.
1923*c0909341SAndroid Build Coastguard Worker.macro fguv_loop_sx1 csfl, ox, oy
1924*c0909341SAndroid Build Coastguard WorkerL(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        // The bl inside the loop overwrites lr, so the number of rows that
        // remain after the overlap rows is parked in r12.
1925*c0909341SAndroid Build Coastguard Worker.if \oy
1926*c0909341SAndroid Build Coastguard Worker        mov             r12, lr
1927*c0909341SAndroid Build Coastguard Worker.endif
1928*c0909341SAndroid Build Coastguard Worker1:
1929*c0909341SAndroid Build Coastguard Worker.if \ox
1930*c0909341SAndroid Build Coastguard Worker        vld1.8          {d8},       [r4],        r10 // grain_lut old
1931*c0909341SAndroid Build Coastguard Worker.endif
1932*c0909341SAndroid Build Coastguard Worker.if \oy
1933*c0909341SAndroid Build Coastguard Worker        vld1.8          {q8},       [r8],        r10 // grain_lut top
1934*c0909341SAndroid Build Coastguard Worker.endif
1935*c0909341SAndroid Build Coastguard Worker.if \ox && \oy
1936*c0909341SAndroid Build Coastguard Worker        vld1.8          {d10},      [r11],       r10 // grain_lut top old
1937*c0909341SAndroid Build Coastguard Worker.endif
        // src is loaded once, with post-increment, into q11 and has to stay
        // intact until the vaddw after the gather call.
1938*c0909341SAndroid Build Coastguard Worker        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
1939*c0909341SAndroid Build Coastguard Worker        vld1.8          {q10},      [r5],        r10 // grain_lut
1940*c0909341SAndroid Build Coastguard Worker        vld1.8          {q11},      [r1, :128],  r2  // src
1941*c0909341SAndroid Build Coastguard Worker
        // Horizontal overlap on the first 8 grain bytes of the row:
        // q4 = old * d24 + cur * d25 (16 bit); narrowed back to d20 below.
        // The overlap_coeffs table contents are outside this chunk.
1942*c0909341SAndroid Build Coastguard Worker.if \ox
1943*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d8,  d24
1944*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d20, d25
1945*c0909341SAndroid Build Coastguard Worker.endif
1946*c0909341SAndroid Build Coastguard Worker
        // Sum horizontally adjacent luma pairs into 16 bit lanes; the
        // rounding shift by 1 further down turns the sums into averages.
1947*c0909341SAndroid Build Coastguard Worker        vpaddl.u8       q0,  q0
1948*c0909341SAndroid Build Coastguard Worker        vpaddl.u8       q1,  q1
1949*c0909341SAndroid Build Coastguard Worker.if \oy
        // With both overlaps, the top row gets the same horizontal blend
        // (top old * d24 + top * d25) and both rows are narrowed with a
        // rounding, saturating shift right by 5 before the vertical blend.
1950*c0909341SAndroid Build Coastguard Worker.if \ox
1951*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d10, d24
1952*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d16, d25
1953*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1954*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d16, q5,  #5
1955*c0909341SAndroid Build Coastguard Worker.endif
1956*c0909341SAndroid Build Coastguard Worker
        // Vertical overlap over all 16 columns:
        // grain = sat_s8(round((cur * d7 + top * d6) >> 5))
1957*c0909341SAndroid Build Coastguard Worker        vmull.s8        q4,  d20, d7
1958*c0909341SAndroid Build Coastguard Worker        vmull.s8        q5,  d21, d7
1959*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q4,  d16, d6
1960*c0909341SAndroid Build Coastguard Worker        vmlal.s8        q5,  d17, d6
1961*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1962*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d21, q5,  #5
1963*c0909341SAndroid Build Coastguard Worker.elseif \ox
1964*c0909341SAndroid Build Coastguard Worker        vqrshrn.s16     d20, q4,  #5
1965*c0909341SAndroid Build Coastguard Worker.endif
        // csfl == 1: the averaged luma, narrowed to bytes in q0, goes to
        // the gather unchanged.
        // csfl == 0: the value handed to the gather in q0 is instead
        //   sat_u8(((avg_luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset)
        // using a saturating 16 bit add and an arithmetic shift.
1966*c0909341SAndroid Build Coastguard Worker.if \csfl
1967*c0909341SAndroid Build Coastguard Worker        vrshrn.u16      d0,  q0,  #1
1968*c0909341SAndroid Build Coastguard Worker        vrshrn.u16      d1,  q1,  #1
1969*c0909341SAndroid Build Coastguard Worker.else
1970*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q4,  q0,  #1
1971*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q5,  q1,  #1
1972*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q0,  d22
1973*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q1,  d23
1974*c0909341SAndroid Build Coastguard Worker        vmul.i16        q4,  q4,  d4[0]
1975*c0909341SAndroid Build Coastguard Worker        vmul.i16        q5,  q5,  d4[0]
1976*c0909341SAndroid Build Coastguard Worker        vmul.i16        q0,  q0,  d4[1]
1977*c0909341SAndroid Build Coastguard Worker        vmul.i16        q1,  q1,  d4[1]
1978*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q4,  q4,  q0
1979*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q5,  q5,  q1
1980*c0909341SAndroid Build Coastguard Worker        vdup.16         q0,  d4[2]
1981*c0909341SAndroid Build Coastguard Worker        vshr.s16        q4,  q4,  #6
1982*c0909341SAndroid Build Coastguard Worker        vshr.s16        q5,  q5,  #6
1983*c0909341SAndroid Build Coastguard Worker        vadd.i16        q4,  q4,  q0
1984*c0909341SAndroid Build Coastguard Worker        vadd.i16        q5,  q5,  q0
1985*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d0,  q4
1986*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d1,  q5
1987*c0909341SAndroid Build Coastguard Worker.endif
1988*c0909341SAndroid Build Coastguard Worker
        // gather16_neon is defined outside this chunk.
        // NOTE(review): judging by the code around the call it takes 16 index
        // bytes in q0 and returns 16 scaling bytes in q4 (d8, d9), leaving
        // the grain in q10, the src in q11 and the loop registers intact -
        // confirm against its definition.
1989*c0909341SAndroid Build Coastguard Worker        bl              gather16_neon
1990*c0909341SAndroid Build Coastguard Worker
1991*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d20       // grain
1992*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q9,  d21
1993*c0909341SAndroid Build Coastguard Worker
1994*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q6,  d8        // scaling
1995*c0909341SAndroid Build Coastguard Worker        vmovl.u8        q7,  d9
1996*c0909341SAndroid Build Coastguard Worker
1997*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q6   // scaling * grain
1998*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q7
1999*c0909341SAndroid Build Coastguard Worker
2000*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
2001*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q9,  q9,  q13
2002*c0909341SAndroid Build Coastguard Worker
2003*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q8,  q8,  d22  // *src + noise
2004*c0909341SAndroid Build Coastguard Worker        vaddw.u8        q9,  q9,  d23
2005*c0909341SAndroid Build Coastguard Worker
        // Saturate to 0..255, then clamp to the range held in q14..q15.
2006*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d0,  q8
2007*c0909341SAndroid Build Coastguard Worker        vqmovun.s16     d1,  q9
2008*c0909341SAndroid Build Coastguard Worker
2009*c0909341SAndroid Build Coastguard Worker        vmax.u8         q0,  q0,  q14
2010*c0909341SAndroid Build Coastguard Worker        vmin.u8         q0,  q0,  q15
2011*c0909341SAndroid Build Coastguard Worker
        // The flags set by subs are consumed by the bgt after the store; the
        // NEON instructions in between leave the condition flags alone.
2012*c0909341SAndroid Build Coastguard Worker        subs            r9,  r9,  #1
        // Exchange the two vertical weights for the next overlapped row.
        // With sy == 1 only one row is overlapped (2 >> 1), so the swapped
        // weights are never used in that case.
2013*c0909341SAndroid Build Coastguard Worker.if \oy
2014*c0909341SAndroid Build Coastguard Worker        vswp            d6,  d7
2015*c0909341SAndroid Build Coastguard Worker.endif
2016*c0909341SAndroid Build Coastguard Worker        vst1.8          {q0}, [r0, :128], r2 // dst
2017*c0909341SAndroid Build Coastguard Worker        bgt             1b
2018*c0909341SAndroid Build Coastguard Worker
        // Overlap rows are done: if rows remain, carry on in the variant
        // with the same csfl and ox but without y overlap.
2019*c0909341SAndroid Build Coastguard Worker.if \oy
2020*c0909341SAndroid Build Coastguard Worker        cmp             r12, #0
2021*c0909341SAndroid Build Coastguard Worker        mov             r9,  r12               // restore actual remaining h
2022*c0909341SAndroid Build Coastguard Worker        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
2023*c0909341SAndroid Build Coastguard Worker.endif
2024*c0909341SAndroid Build Coastguard Worker
2025*c0909341SAndroid Build Coastguard Worker        b               9f
2026*c0909341SAndroid Build Coastguard Worker.endm
        // Expand all eight variants; the argument order is csfl, ox, oy.
2027*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   0, 0, 0
2028*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   0, 0, 1
2029*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   0, 1, 0
2030*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   0, 1, 1
2031*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   1, 0, 0
2032*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   1, 0, 1
2033*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   1, 1, 0
2034*c0909341SAndroid Build Coastguard Worker        fguv_loop_sx1   1, 1, 1
2035*c0909341SAndroid Build Coastguard Worker
        // Shared exit for all variants: restores what the fguv_32x32_* entry
        // point pushed and returns to that entry point's caller.
2036*c0909341SAndroid Build Coastguard Worker9:
2037*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
2038*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2039*c0909341SAndroid Build Coastguard Workerendfunc
2040