/* xref: /aosp_15_r20/external/libdav1d/src/arm/32/filmgrain.S
   (revision c09093415860a1c2373dacd84c4fde00c507cdfd) */
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38
37
// Advance the 16 bit pseudo-random generator state held in r2 by
// \steps bits (the film grain LFSR). The feedback bits are
// (r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12) of the current state.
// With shift=0 the state is not shifted down; the new bits are still
// inserted at bit 16 and the caller consumes/shifts the state itself.
// Clobbers r11, r12 and lr.
.macro increment_seed steps, shift=1
        lsr             r11, r2,  #3
        lsr             r12, r2,  #12
        lsr             lr,  r2,  #1
        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             r2,  r2,  #\steps
.endif
        and             r11, r11, #((1 << \steps) - 1)    // bit
.if \shift
        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
.else
        orr             r2,  r2,  r11, lsl #16            // *state
.endif
.endm
55
// Extract a \bits wide random value from the state in r2.
// \age is how many of the bits produced by the last increment_seed
// have already been consumed by earlier read_rand invocations.
.macro read_rand dest, bits, age
        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
.endm
59
// Extract a \bits wide random value from above bit 17 of r2 and
// shift the state down by one bit.
.macro read_shift_rand dest, bits
        ubfx            \dest,  r2,   #17 - \bits, #\bits
        lsr             r2,  r2,  #1
.endm
64
// Produce 8 int16 entries from the gaussian sequence table, indexed by
// eight consecutive 11 bit random values (two increment_seed 4 batches).
// Table loads are interleaved with the random reads to hide latency.
// special calling convention:
// r2 holds seed
// r3 holds dav1d_gaussian_sequence
// clobbers r11-r12
// returns in d0-d1
function get_gaussian_neon
        push            {r5-r6,lr}
        increment_seed  4
        read_rand       r5,  11,  3
        read_rand       r6,  11,  2
        add             r5,  r3,  r5,  lsl #1              // &table[rand], 16 bit entries
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[0]}, [r5]
        read_rand       r5,  11,  1
        vld1.16         {d0[1]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6, 11,  0
        increment_seed  4
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[2]}, [r5]
        read_rand       r5,  11,  3
        vld1.16         {d0[3]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  2
        vld1.16         {d1[0]}, [r5]
        add             r6,  r3,  r6,  lsl #1
        read_rand       r5,  11,  1
        vld1.16         {d1[1]}, [r6]
        read_rand       r6,  11,  0
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d1[2]}, [r5]
        vld1.16         {d1[3]}, [r6]
        pop             {r5-r6,pc}
endfunc
100
// Produce one full-width grain row: ten batches of 8 bytes in \r0-\r9
// plus two more entries in the low lanes of \r10
// (10*8 + 2 = 82 = GRAIN_WIDTH). Each batch is fetched via
// get_gaussian_neon, rounded by the shift held in q15/d30 (vrshl with
// a negative shift count rounds and shifts right) and narrowed to 8 bit.
.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r0, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r1, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r2, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r3, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r4, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r5, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r6, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r7, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r8, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r9, q0
        // The final two entries of the row are generated inline.
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       \r10, q0
.endm
142
// Store one 82 byte grain row produced by get_grain_row: ten d
// registers (80 bytes) plus one 16 bit lane (the final 2 bytes),
// advancing the output pointer r0 past the row.
.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
        vst1.16         {\r4, \r5, \r6, \r7}, [r0]!
        vst1.16         {\r8, \r9},           [r0]!
        vst1.16         {\r10[0]},            [r0]!
.endm
149
// Produce one subsampled grain row: five batches of 8 bytes in \r0-\r4
// plus four more entries in \r5 (5*8 + 4 = 44 = SUB_GRAIN_WIDTH).
// Rounding shift comes from q15/d30, as in get_grain_row.
.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r0, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r1, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r2, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r3, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r4, q0
        // The final four entries are generated inline.
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[]}, [r11]      // broadcast load; lanes 1-3 are overwritten below
        read_rand       r11, 11,  1
        vld1.16         {d0[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        read_rand       r12, 11,  0
        vld1.16         {d0[2]}, [r11]
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[3]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       \r5, q0
.endm
182
// Store one 44 byte subsampled grain row (48 bytes are written; the
// last 4 fall into the padding within the GRAIN_WIDTH stride), then
// advance r0 by a full GRAIN_WIDTH stride (32 from the first store's
// writeback plus GRAIN_WIDTH-32 here).
.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
        vst1.16         {\r4, \r5},           [r0]
        add             r0,  r0,  #GRAIN_WIDTH-32
.endm
188
// Produce two grain values: two 11 bit random table lookups, rounded
// by d30 and narrowed, returned in the low two bytes of d0.
// Uses r2 (seed) and r3 (gaussian sequence); clobbers r12.
function get_grain_2_neon
        push            {r11,lr}
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       d0,  q0
        pop             {r11,pc}
endfunc
202
// Call get_grain_2_neon and move the result into \dst,
// skipping the move when \dst already is d0.
.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm
209
// Scalar AR-filter recursion: each new entry depends on the previously
// produced one(s), so this runs one output per loop iteration.
// r1 holds the number of entries to produce
// r6, r8 and r10 hold the previous output entries
// q0 holds the vector of produced entries
// q1 holds the input vector of sums from above
// Additionally (from surrounding code): r2 = seed, r3 = gaussian
// sequence, r4 = packed AR coefficients for the previous outputs,
// r5 = upper clamp value, r7 = ar_coeff_shift,
// r9 = 4 + grain_scale_shift. Results are clamped to [-128, r5].
.macro output_lag n
function output_lag\n\()_neon
        push            {r0, lr}
.if \n == 1
        mov             lr,  #-128
.else
        // Compute the rounding constants 1 << (shift - 1) for both shifts.
        mov             r0,  #1
        mov             lr,  #1
        sub             r7,  r7,  #1
        sub             r9,  r9,  #1
        lsl             r0,  r0,  r7
        lsl             lr,  lr,  r9
        add             r7,  r7,  #1
        add             r9,  r9,  #1
.endif
1:
        read_shift_rand r12, 11
        vmov.32         r11, d2[0]                // next "sum from above" entry
        lsl             r12, r12, #1
        vext.8          q0,  q0,  q0,  #1         // make room for the new output byte
        ldrsh           r12, [r3, r12]            // gaussian_sequence[rand]
.if \n == 1
        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
        add             r12, r12, r10
        asr             r6,  r6,  r7              // >> ar_coeff_shift
        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        cmp             r6,  r5
.elseif \n == 2
        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
        mov             r8,  r6                   // rotate the previous-output history
        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
        asr             r6,  r6,  r7              // >> ar_coeff_shift
        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mov             lr,  #-128
.else
        // Lag 3: the three coefficients are packed as bytes in r4.
        push            {r1-r3}
        sbfx            r1,  r4,  #0,  #8
        sbfx            r2,  r4,  #8,  #8
        sbfx            r3,  r4,  #16, #8
        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
        pop             {r1-r3}
        mov             r10, r8                   // rotate the previous-output history
        mov             r8,  r6

        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
        asr             r6,  r6,  r7              // >> ar_coeff_shift
        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mov             lr,  #-128
.endif
        // Clamp the new output to [lr, r5] = [-128, max].
        it              gt
        movgt           r6,  r5
        cmp             r6,  lr
        it              lt
        movlt           r6,  lr
.if \n >= 2
        pop             {lr}
.endif
        subs            r1,  r1,  #1
        vext.8          q1,  q1,  q1,  #4         // consume one 32 bit sum entry
        vmov.8          d1[7], r6                 // append the new output byte
        bgt             1b
        pop             {r0, pc}
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3
295
296
// Lag 1 "above" contribution for 16 pixels: the three shifted copies of
// the row above (q3 = centered, q0 = shifted by one, q1 = shifted the
// other way; set up by the sum_lag1 macro) are weighted by the
// coefficients in d27/d28/d29 and accumulated as 32 bit sums in q2-q5.
function sum_lag1_above_neon
        vmull.s8        q2,  d6,  d28
        vmull.s8        q3,  d7,  d28
        vmull.s8        q4,  d0,  d27
        vmull.s8        q5,  d1,  d27

        vaddl.s16       q0,  d4,  d8
        vaddl.s16       q2,  d5,  d9
        vaddl.s16       q4,  d6,  d10
        vaddl.s16       q5,  d7,  d11

        vmull.s8        q3,  d3,  d29
        vmull.s8        q1,  d2,  d29

        vaddw.s16       q4,  q4,  d6
        vaddw.s16       q5,  q5,  d7
        vaddw.s16       q3,  q2,  d3
        vaddw.s16       q2,  q0,  d2
        bx              lr
endfunc
317
// Shared body for all sum_\type\()_\lag\()_\edge\() functions:
// 1. Accumulate the weighted "above" taps into q2-q5 (32 bit sums).
// 2. For chroma, add the co-located luma grain (subsampled for
//    420/422), weighted by the uv coefficient in d13.
// 3. Feed the sums through the scalar AR recursion
//    (output_\lag\()_neon) in groups of at most 4, interleaved with
//    seed updates and, at the edges, inline random table lookups.
// Chroma variants with full element counts tail-branch into the
// corresponding luma/420 start label to share the recursion code.
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
.ifc \lag\()_\edge, lag3_left
        bl              sum_lag3_left_above_neon
.else
        bl              sum_\lag\()_above_neon
.endif
.ifc \type, uv_420
        // 2x2 box average of two luma grain rows (r11, r11+GRAIN_WIDTH).
        vpush           {q6-q7}
        add             r12, r11, #GRAIN_WIDTH
        vld1.16         {q0, q1}, [r11]!
        vld1.16         {q6, q7}, [r12]!
        vpaddl.s8       q0,  q0
        vpaddl.s8       q1,  q1
        vpaddl.s8       q6,  q6
        vpaddl.s8       q7,  q7
        vadd.i16        q0,  q0,  q6
        vadd.i16        q1,  q1,  q7
        vpop            {q6-q7}
        vrshrn.s16      d0,  q0,  #2
        vrshrn.s16      d1,  q1,  #2
.endif
.ifc \type, uv_422
        // Horizontal pairwise average of one luma grain row.
        vld1.8          {q0, q1}, [r11]!
        vpaddl.s8       q0,  q0
        vpaddl.s8       q1,  q1
        vrshrn.s16      d0,  q0,  #1
        vrshrn.s16      d1,  q1,  #1
.endif
.ifc \type, uv_444
        vld1.8          {q0}, [r11]!
.endif
.if \uv_layout
.ifnb \uv_coeff
        vdup.8          d13, \uv_coeff
.endif
        // Add luma grain, weighted by the uv coefficient in d13.
        vmull.s8        q1,  d0,  d13
        vmull.s8        q0,  d1,  d13
        vaddw.s16       q2,  q2,  d2
        vaddw.s16       q3,  q3,  d3
        vaddw.s16       q4,  q4,  d0
        vaddw.s16       q5,  q5,  d1
.endif
.if \uv_layout && \elems == 16
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
        push            {r11}
.ifc \edge, left
        // Left edge: generate the first 3 entries without AR history,
        // then seed the history registers (r6/r8/r10) from them.
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d1[1]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d1[2]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d1[3]}, [r11]
        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
        vrshl.s16       d1,  d1,  d30
        vmovn.i16       d1,  q0
        vext.8          q2,  q2,  q2,  #12
.ifc \lag, lag3
        vmov.s8         r10, d1[5]
.endif
.ifnc \lag, lag1
        vmov.s8         r8,  d1[6]
.endif
        vmov.s8         r6,  d1[7]

        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        vmov            q1,  q2
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        vmov            q1,  q3
        mov             r1,  #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        vmov            q1,  q4
.if \elems == 9
        // Subsampled right edge: one filtered entry, then 3 raw lookups.
        mov             r1,  #1
        bl              output_\lag\()_neon
        lsr             r2,  r2,  #3

        read_rand       r11, 11,  2
        read_rand       r12, 11,  1
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r11]
        read_rand       r11, 11,  0
        vld1.16         {d2[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[2]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vmovn.i16       d2,  q1
        vext.8          q0,  q0,  q1,  #7
.else
        mov             r1,  #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        vmov            q1,  q5

.ifc \edge, right
        // Full-width right edge: 3 filtered entries plus 1 raw lookup.
        mov             r1,  #3
        bl              output_\lag\()_neon
        read_shift_rand r11, 11
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[0]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #1
.else
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif
.endif
.if \store
        vst1.8          {q0}, [r0]!
.endif
        pop             {r11}
        pop             {r1, pc}
.endif
.endm
453
// Instantiate one sum_\type\()_lag1_\edge\()_neon entry point around
// sum_lag_n_body, for every plane type / edge combination below.
.macro sum_lag1_func type, uv_layout, edge, elems=16
function sum_\type\()_lag1_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 15
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 15
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 9
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 9
473
// Set up the three views of the row above for the lag1 filter:
// q3 = \mid, q0 = \mid shifted one byte (bringing in the last byte of
// \left), q1 = \mid shifted the other way (bringing in the first byte
// of \right); then run the lag1 function and copy the result to \dst.
.macro sum_lag1 type, dst, left, mid, right, edge=mid
        vmov            q3,  \mid
        vext.8          q0,  \left, \mid,   #15
        vext.8          q1,  \mid,  \right, #1
        bl              sum_\type\()_lag1_\edge\()_neon
        vmov            \dst, q0
.endm

// Per-plane-type convenience wrappers around sum_lag1.
.macro sum_y_lag1 dst, left, mid, right, edge=mid
        sum_lag1        y, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
.endm
497
498
// Lag 2 "above" contribution for 16 pixels. The two rows above are
// kept across calls in q8/q9 (row -2) and q11/q12 (row -1); the next
// 16 bytes of each row are loaded into q10/q13. Each tap (vext of
// left/right shifted row data) is weighted by a coefficient lane from
// d28-d29 and accumulated as 32 bit sums in q2-q5. At the end the row
// registers are shifted along (q9->q8, q10->q9, q12->q11, q13->q12)
// for the next 16-pixel call.
function sum_lag2_above_neon
        push            {lr}
        sub             r12, r0,  #2*GRAIN_WIDTH - 16
        sub             lr,  r0,  #1*GRAIN_WIDTH - 16
        vld1.8          {q10}, [r12] // load top right
        vld1.8          {q13}, [lr]

        vext.8          q6,  q8,  q9,  #14 // top left, top mid
        vdup.8          d14, d28[0]
        vext.8          q8,  q8,  q9,  #15
        vdup.8          d15, d28[1]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q2,  d0,  d12
        vaddl.s16       q3,  d1,  d13
        vaddl.s16       q4,  d2,  d16
        vaddl.s16       q5,  d3,  d17

        vext.8          q6,  q9,  q10, #1  // top mid, top right
        vdup.8          d14, d28[3]
        vext.8          q8,  q9,  q10, #2
        vdup.8          d15, d28[4]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vext.8          q6,  q11, q12, #14 // top left, top mid
        vdup.8          d14, d28[5]
        vext.8          q8,  q11, q12, #15
        vdup.8          d15, d28[6]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vext.8          q6,  q12, q13, #1  // top mid, top right
        vdup.8          d14, d29[0]
        vext.8          q8,  q12, q13, #2
        vdup.8          d15, d29[1]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        // Center taps of the two rows above (unshifted q9 and q12).
        vdup.8          d14, d28[2]
        vdup.8          d15, d28[7]

        vmull.s8        q0,  d18, d14
        vmull.s8        q1,  d19, d14
        vmull.s8        q6,  d24, d15
        vmull.s8        q8,  d25, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vmov            q8,  q9
        vmov            q9,  q10

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vmov            q11, q12
        vmov            q12, q13

        pop             {pc}
endfunc
607
// Instantiate one sum_\type\()_lag2_\edge\()_neon entry point. At the
// left edge the row history registers (q9, q12) are preloaded from the
// two rows directly above the output pointer.
.macro sum_lag2_func type, uv_layout, edge, elems=16
function sum_\type\()_lag2_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #2*GRAIN_WIDTH
        sub             lr,  r0,  #1*GRAIN_WIDTH
        vld1.8          {q9},  [r12] // load the previous block right above
        vld1.8          {q12}, [lr]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 15
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 15
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 9
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 9
633
634
// Left-edge variant of sum_lag3_above_neon: loads the row starting at
// the edge and shifts it into the position the shared code expects,
// instead of reading 3 bytes before the buffer start.
function sum_lag3_left_above_neon
        // A separate codepath for the left edge, to avoid reading outside
        // of the edge of the buffer.
        sub             r12, r0,  #3*GRAIN_WIDTH
        vld1.8          {q11, q12}, [r12]
        vext.8          q12, q11, q12, #13
        vext.8          q11, q11, q11, #13
        b               sum_lag3_above_start
endfunc
644
// Lag 3 "above" contribution for 16 pixels: the three rows above
// (loaded from r0 - 3/2/1*GRAIN_WIDTH - 3) each contribute seven taps
// (vext offsets #0-#6 across q11/q12), weighted by consecutive
// coefficient lanes from d26-d28 and accumulated as 32 bit sums in
// q2-q5. Loads of the next row are interleaved with the accumulation
// of the current one.
function sum_lag3_above_neon
        sub             r12, r0,  #3*GRAIN_WIDTH + 3
        vld1.8          {q11, q12}, [r12]

sum_lag3_above_start:
        vdup.8          d20, d26[0]
        vext.8          q9,  q11, q12, #1
        vdup.8          d21, d26[1]

        vmull.s8        q0,  d22, d20
        vmull.s8        q1,  d23, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vext.8          q8,  q11, q12, #2
        vdup.8          d20, d26[2]
        vext.8          q9,  q11, q12, #3
        vdup.8          d21, d26[3]

        vaddl.s16       q2,  d0,  d12
        vaddl.s16       q3,  d1,  d13
        vaddl.s16       q4,  d2,  d14
        vaddl.s16       q5,  d3,  d15

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #4
        vdup.8          d20, d26[4]
        vext.8          q7,  q11, q12, #5
        vdup.8          d21, d26[5]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        sub             r12, r0,  #2*GRAIN_WIDTH + 3

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        // Last tap of row -3; load row -2 into q11/q12.
        vext.8          q8,  q11, q12, #6
        vld1.8          {q11, q12}, [r12]
        vdup.8          d20, d26[6]
        vdup.8          d21, d26[7]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d22, d21
        vmull.s8        q7,  d23, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #1
        vdup.8          d20, d27[0]
        vext.8          q7,  q11, q12, #2
        vdup.8          d21, d27[1]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vext.8          q8,  q11, q12, #3
        vdup.8          d20, d27[2]
        vext.8          q9,  q11, q12, #4
        vdup.8          d21, d27[3]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        sub             r12, r0,  #1*GRAIN_WIDTH + 3

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #5
        vdup.8          d20, d27[4]
        vext.8          q7,  q11, q12, #6
        vdup.8          d21, d27[5]

        // Load row -1 into q11/q12.
        vld1.8          {q11, q12}, [r12]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vdup.8          d20, d27[6]
        vext.8          q9,  q11, q12, #1
        vdup.8          d21, d27[7]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d22, d20
        vmull.s8        q1,  d23, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #2
        vdup.8          d20, d28[0]
        vext.8          q7,  q11, q12, #3
        vdup.8          d21, d28[1]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vext.8          q8,  q11, q12, #4
        vdup.8          d20, d28[2]
        vext.8          q9,  q11, q12, #5
        vdup.8          d21, d28[3]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        // Final tap of row -1.
        vext.8          q6,  q11, q12, #6
        vdup.8          d20, d28[4]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20

        vaddw.s16       q2,  q2,  d0
        vaddw.s16       q3,  q3,  d1
        vaddw.s16       q4,  q4,  d2
        vaddw.s16       q5,  q5,  d3

        bx              lr
endfunc
862
// Instantiate one sum_\type\()_lag3_\edge\()_neon entry point around
// sum_lag_n_body (which picks sum_lag3_left_above_neon at the left edge).
.macro sum_lag3_func type, uv_layout, edge, elems=16
function sum_\type\()_lag3_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 15
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 15
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 9
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 9
882
// Generate r1 full-width (82 byte) grain rows, stored sequentially at r0.
function generate_grain_rows_neon
        push            {r11,lr}
1:
        get_grain_row   d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
        subs            r1,  r1,  #1
        store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
        bgt             1b
        pop             {r11,pc}
endfunc
892
// Generate r1 subsampled (44 byte) grain rows at r0, each padded out
// to the full GRAIN_WIDTH stride by store_grain_row_44.
function generate_grain_rows_44_neon
        push            {r11,lr}
1:
        get_grain_row_44 d16, d17, d18, d19, d20, d21
        subs            r1,  r1,  #1
        store_grain_row_44 d16, d17, d18, d19, d20, d21
        bgt             1b
        pop             {r11,pc}
endfunc
902
// One 16-byte step of lag-0 chroma (4:4:4) grain generation:
//   out = sat8(round2(ar_coeffs_uv[0] * (luma & mask), ar_coeff_shift)
//              + noise)
// In:  r11 = luma grain pointer (advanced by 16)
//      r0  = chroma grain output pointer (advanced by 16)
//      q1  = byte mask selecting valid luma lanes (row-edge handling)
//      d22 = ar_coeffs_uv[0] broadcast, q12 = -ar_coeff_shift,
//      q15 = -(4 + grain_scale_shift)
// Out: q2 = the 16 result bytes (also stored to [r0])
function gen_grain_uv_444_lag0_neon
        vld1.8          {q3}, [r11]!        // 16 bytes of luma grain
        push            {r11,lr}
        bl              get_gaussian_neon
        vrshl.s16       q8,  q0,  q15       // round2(gauss, 4 + grain_scale_shift)
        bl              get_gaussian_neon
        vrshl.s16       q9,  q0,  q15
        vqmovn.s16      d0,  q8             // narrow fresh noise to s8
        vqmovn.s16      d1,  q9

        vand            q3,  q3,  q1        // mask out edge luma lanes
        vmull.s8        q2,  d6,  d22       // luma * ar_coeffs_uv[0]
        vmull.s8        q3,  d7,  d22
        vrshl.s16       q2,  q2,  q12       // round2(..., ar_coeff_shift)
        vrshl.s16       q3,  q3,  q12
        vaddw.s8        q2,  q2,  d0        // + noise
        vaddw.s8        q3,  q3,  d1
        vqmovn.s16      d4,  q2             // saturate back to s8
        vqmovn.s16      d5,  q3
        vst1.8          {q2}, [r0]!
        pop             {r11,pc}
endfunc
925
// Thin callable wrapper around the get_grain_row_44 macro: produce one
// fresh 44-wide grain row in d16-d21 (r11 saved/restored around it).
function get_grain_row_44_neon
        push            {r11,lr}
        get_grain_row_44 d16, d17, d18, d19, d20, d21
        pop             {r11,pc}
endfunc
931
// Lag-0 chroma for 4:2:0: downsample luma grain 2x2 by averaging —
// two rows read from [r11] and [r12], horizontal pairs summed with
// vpaddl, then a rounding shift by 2 — leaving 16 averaged bytes in
// q2, and fall through to the shared scale-and-add tail below.
function add_uv_420_coeff_lag0_neon
        vld1.16         {q2, q3}, [r11]!    // luma row 0 (32 bytes)
        vld1.16         {q4, q5}, [r12]!    // luma row 1 (32 bytes)
        vpaddl.s8       q2,  q2             // sum horizontal pairs
        vpaddl.s8       q3,  q3
        vpaddl.s8       q4,  q4
        vpaddl.s8       q5,  q5
        vadd.i16        q2,  q2,  q4        // add the two rows
        vadd.i16        q3,  q3,  q5
        vrshrn.s16      d4,  q2,  #2        // round2(sum of 4, 2)
        vrshrn.s16      d5,  q3,  #2
        b               add_coeff_lag0_start
endfunc
945
// Lag-0 chroma for 4:2:2: downsample one luma grain row 2x1 by
// averaging horizontal pairs (vpaddl + rounding shift by 1), then run
// the shared tail.
function add_uv_422_coeff_lag0_neon
        vld1.16         {q2, q3}, [r11]!    // luma row (32 bytes)
        vpaddl.s8       q2,  q2             // sum horizontal pairs
        vpaddl.s8       q3,  q3
        vrshrn.s16      d4,  q2,  #1        // round2(sum of 2, 1)
        vrshrn.s16      d5,  q3,  #1

// Shared tail for the 420/422 lag-0 helpers:
//   q2 = sat8(round2(ar_coeffs_uv[0] * (q2 & q1), ar_coeff_shift) + q0)
// with d22 = ar_coeffs_uv[0] broadcast, q12 = -ar_coeff_shift,
// q1 = lane mask and q0 = the fresh noise row (s8).
add_coeff_lag0_start:
        vand            q3,  q2,  q1        // mask out edge lanes
        vmull.s8        q2,  d6,  d22       // luma * ar_coeffs_uv[0]
        vmull.s8        q3,  d7,  d22
        vrshl.s16       q2,  q2,  q12       // round2(..., ar_coeff_shift)
        vrshl.s16       q3,  q3,  q12
        vaddw.s8        q2,  q2,  d0        // + noise
        vaddw.s8        q3,  q3,  d1
        vqmovn.s16      d4,  q2             // saturate back to s8
        vqmovn.s16      d5,  q3
        bx              lr
endfunc
965
// Emit generate_grain_y_8bpc_neon / generate_grain_uv_444_8bpc_neon:
// fill one full-size (82 wide, 73 high) grain LUT.
//   y:      r0 = buf, r1 = data
//   uv_444: r0 = buf, r1 = buf_y (luma grain), r2 = data, r3 = uv
// Dispatches on data->ar_coeff_lag (0..3) via the table below.
.macro gen_grain_82 type
function generate_grain_\type\()_8bpc_neon, export=1
        push            {r4-r11,lr}

.ifc \type, uv_444
        mov             r12, r3             // r12 = uv (plane index)
        mov             lr,  #28            // per-plane stride of ar_coeffs_uv[]
                                            // (25 coeffs + padding; see headers)
        add             r11, r1,  #3*GRAIN_WIDTH // r11 = luma grain, 3 rows in
        mov             r1,  r2             // r1 = data, matching the y layout
        mul             r12, r12, lr        // r12 = uv * 28
.endif
        movrel          r3,  X(gaussian_sequence)
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             r4,  r1,  #FGD_AR_COEFFS_Y
.else
        add             r4,  r1,  #FGD_AR_COEFFS_UV
.endif
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]   // table entry for this lag
        vdup.16         q15, r9    // 4 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15   // negated: vrshl with it is a rounding right shift

.ifc \type, uv_444
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10       // seed ^= (uv ? 0x49d8 : 0xb524)
.endif

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)

        bx              r5                  // jump to the lag-specific loop

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, y
        mov             r1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon // lag-0 luma is pure noise
.else

        // Chroma lag 0 still mixes in co-located luma grain: 3 seed
        // rows of plain noise, then 5 blocks of 16 per row via
        // gen_grain_uv_444_lag0_neon plus a 2-pixel tail.
        mov             r1,  #3
        bl              generate_grain_rows_neon
        mov             r1,  #GRAIN_HEIGHT-3

        vdup.16         q12, r7             // ar_coeff_shift
        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vext.8          q13, q0,  q1,  #13  // mask: first 3 lanes cleared
        vext.8          q14, q1,  q0,  #1   // mask: last lane cleared
        vneg.s16        q12, q12            // -ar_coeff_shift for vrshl

1:
        vmov            q1,  q13            // left-edge mask
        bl              gen_grain_uv_444_lag0_neon // 16
        vmov.i8         q1,  #255           // interior: all lanes valid
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 64
        vmov            q1,  q14            // right-edge mask
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     d16                 // final 2 of the 82 columns
        subs            r1,  r1,  #1
        add             r11, r11, #2        // skip the luma row tail
        vst1.16         {d16[0]}, [r0]!
        bgt             1b
.endif
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
.else
        add             r4,  r4,  #2
.endif

        mov             r1,  #3
.ifc \type, uv_444
        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon // 3 seed rows of plain noise

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        // Filter one row in five 16-pixel blocks (left/mid/right edge
        // variants of the sum_*_lag1 macro) plus a 2-pixel tail, then
        // rotate the row registers for the next iteration.
        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
        sum_\type\()_lag1 q8,  q8,  q9,  q10
        sum_\type\()_lag1 q9,  q9,  q10, q11
        sum_\type\()_lag1 q10, q10, q11, q12
        sum_\type\()_lag1 q12, q11, q12, q13, right
        get_grain_2     d26
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2        // skip the luma row tail
.endif
        store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
        vmov            q11, q10            // this row becomes the
        vmov            q10, q9             // "previous row" of the next one
        vmov            q9,  q8
        vmov            q8,  q7
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]         // ar_coeffs[10] as a scalar
        vmov.s8         r10, d29[3]         // ar_coeffs[11] as a scalar

        mov             r1,  #3
        bl              generate_grain_rows_neon // 3 seed rows of plain noise

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        get_grain_2     d16                 // final 2 of the 82 columns
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2        // skip the luma row tail
.endif
        vst1.16         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]         // ar_coeffs[21]
        vmov.u8         r10, d28[6]         // ar_coeffs[22]
        vmov.u8         r12, d28[7]         // ar_coeffs[23]

        orr             r4,  r4,  r10, lsl #8   // pack ar_coeffs[21..23]
        orr             r4,  r4,  r12, lsl #16  // into a single register

        mov             r1,  #3
        vpush           {d26}               // row generation uses d16-d26;
        bl              generate_grain_rows_neon // preserve the coeffs in d26
        vpop            {d26}

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        get_grain_2     d16                 // final 2 of the 82 columns
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2        // skip the luma row tail
.endif
        vst1.16         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm
1163
// Instantiate the full-size grain generators for luma and 4:4:4 chroma.
gen_grain_82 y
gen_grain_82 uv_444
1166
// Set \dst to the number of AR-filtered rows: total grain height minus
// the 3 seed rows (subsampled height for 4:2:0).
.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst,  #GRAIN_HEIGHT-3
.endif
.endm
1174
// Step the luma grain pointer \reg to the next chroma row, compensating
// for the 3*32 bytes consumed while processing the current row: net
// advance is two luma rows for 4:2:0, one luma row for 4:2:2.
.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
.else
        sub             \reg, \reg, #3*32-GRAIN_WIDTH
.endif
.endm
1182
// Emit generate_grain_uv_420_8bpc_neon / generate_grain_uv_422_8bpc_neon:
// fill one subsampled (44 wide) chroma grain LUT from the luma grain.
//   r0 = buf, r1 = buf_y (luma grain), r2 = data, r3 = uv (plane index)
// Dispatches on data->ar_coeff_lag (0..3) via the table below.
.macro gen_grain_44 type
function generate_grain_\type\()_8bpc_neon, export=1
        push            {r4-r11,lr}

        mov             r12, r3             // r12 = uv (plane index)
        mov             lr,  #28            // per-plane stride of ar_coeffs_uv[]
        add             r11, r1,  #3*GRAIN_WIDTH-3 // luma grain input position
        mov             r1,  r2             // r1 = data
        mul             r12, r12, lr        // r12 = uv * 28

        movrel          r3,  X(gaussian_sequence)
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
        add             r4,  r1,  #FGD_AR_COEFFS_UV
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]   // table entry for this lag
        vdup.16         q15, r9    // 4 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15   // negated: vrshl with it is a rounding right shift

        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10       // seed ^= (uv ? 0x49d8 : 0xb524)

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
        bx              r5                  // jump to the lag-specific loop

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, uv_420
        vpush           {q4-q5}
.endif
        mov             r1,  #3
        bl              generate_grain_rows_44_neon // 3 seed rows of plain noise
        set_height      r1,  \type

        vdup.16         q12, r7             // ar_coeff_shift
        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vext.8          q13, q0,  q1,  #13  // mask: first 3 lanes cleared
        vext.8          q14, q1,  q0,  #7   // mask: last 7 lanes cleared
        vneg.s16        q12, q12            // -ar_coeff_shift for vrshl

1:
        bl              get_grain_row_44_neon // fresh noise row in d16-d21
.ifc \type, uv_420
        add             r12, r11, #GRAIN_WIDTH // second luma row for 2x2 averaging
.endif
        vmov            q1,  q13            // left-edge mask
        vmov            q0,  q8             // noise block 0
        bl              add_\type\()_coeff_lag0_neon
        vmov.i8         q1,  #255           // interior: all lanes valid
        vmov            q0,  q9             // noise block 1
        vmov            q8,  q2             // keep result of block 0
        bl              add_\type\()_coeff_lag0_neon
        vmov.i8         q1,  q14            // right-edge mask
        vmov            q0,  q10            // noise block 2
        vmov            q9,  q2             // keep result of block 1
        bl              add_\type\()_coeff_lag0_neon
        vmov            q10, q2
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        store_grain_row_44 d16, d17, d18, d19, d20, d21
        bgt             1b

.ifc \type, uv_420
        vpop            {q4-q5}
.endif
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
        add             r4,  r4,  #2

        mov             r1,  #3
        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        // Filter one 44-wide row in three 16-pixel blocks
        // (left/mid/right variants), then rotate the row registers.
        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
        sum_\type\()_lag1 q8,  q8,  q9,  q10
        sum_\type\()_lag1 q10, q9,  q10, q11, right
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        store_grain_row_44 d14, d15, d16, d17, d20, d21
        vmov            q9,  q8             // this row becomes the next
        vmov            q8,  q7             // iteration's "previous row"
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]         // ar_coeffs_uv[10] as a scalar
        vmov.s8         r10, d29[3]         // ar_coeffs_uv[11] as a scalar

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH-48 // dst advanced 48 by the stores;
                                                  // step to the next row
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]         // ar_coeffs[21]
        vmov.u8         r10, d28[6]         // ar_coeffs[22]
        vmov.u8         r12, d28[7]         // ar_coeffs[23]

        orr             r4,  r4,  r10, lsl #8   // pack ar_coeffs[21..23]
        orr             r4,  r4,  r12, lsl #16  // into a single register

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH-48 // dst advanced 48 by the stores;
                                                  // step to the next row
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm
1355
// Instantiate the subsampled chroma grain generators.
gen_grain_44 uv_420
gen_grain_44 uv_422
1358
// Table-lookup pass: for lanes off, off+2, off+4, off+6 of \src1 and
// \src2, load scaling[lane value] into the same lanes of \dst1/\dst2.
// r3 = scaling table base; clobbers r11, r12, lr. Address computation
// and loads are interleaved to hide load latency.
.macro gather_interleaved dst1, dst2, src1, src2, off
        vmov.u8         r11, \src1[0+\off]
        vmov.u8         r12, \src2[0+\off]
        add             r11, r11, r3
        vmov.u8         lr,  \src1[2+\off]
        add             r12, r12, r3
        vld1.8          {\dst1[0+\off]}, [r11]
        vmov.u8         r11, \src2[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst2[0+\off]}, [r12]
        vmov.u8         r12, \src1[4+\off]
        add             r11, r11, r3
        vld1.8          {\dst1[2+\off]}, [lr]
        vmov.u8         lr,  \src2[4+\off]
        add             r12, r12, r3
        vld1.8          {\dst2[2+\off]}, [r11]
        vmov.u8         r11, \src1[6+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst1[4+\off]}, [r12]
        vmov.u8         r12, \src2[6+\off]
        add             r11, r11, r3
        vld1.8          {\dst2[4+\off]}, [lr]
        add             r12, r12, r3
        vld1.8          {\dst1[6+\off]}, [r11]
        vld1.8          {\dst2[6+\off]}, [r12]
.endm
1385
// Gather scaling[] for all 8 lanes of each of \src1..\src4 into the
// corresponding lanes of \dst1..\dst4 (even lanes first, then odd).
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
        gather_interleaved \dst1, \dst3, \src1, \src3, 0
        gather_interleaved \dst1, \dst3, \src1, \src3, 1
        gather_interleaved \dst2, \dst4, \src2, \src4, 0
        gather_interleaved \dst2, \dst4, \src2, \src4, 1
.endm
1392
// Look up scaling[·] for the 32 bytes in d0-d3; results in d8-d11.
// r3 = scaling table base; r11, r12, lr are saved and restored.
function gather32_neon
        push            {r11-r12,lr}
        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3
        pop             {r11-r12,pc}
endfunc
1398
// Look up scaling[·] for the 16 bytes in d0-d1; results in d8-d9.
// r3 = scaling table base; r11, r12, lr are saved and restored.
function gather16_neon
        push            {r11-r12,lr}
        gather_interleaved d8,  d9,  d0,  d1,  0
        gather_interleaved d8,  d9,  d0,  d1,  1
        pop             {r11-r12,pc}
endfunc
1405
// Grain overlap blend coefficients: per-lane old/new weight pairs
// summing to 32, applied with vmull/vmlal and a rounding narrow by 5.
// Unsubsampled case: two overlapping lanes (27/17 and 17/27); the
// remaining lanes pass the new grain through (*32).
const overlap_coeffs_0, align=4
        .byte 27, 17, 0,  0,  0,  0,  0,  0
        .byte 17, 27, 32, 32, 32, 32, 32, 32
endconst
1410
// Subsampled case: a single overlapping lane with weights 23/22; the
// remaining lanes pass the new grain through (*32).
const overlap_coeffs_1, align=4
        .byte 23, 0,  0,  0,  0,  0,  0,  0
        .byte 22, 32, 32, 32, 32, 32, 32, 32
endconst
1415
// Derive grain_lut x/y offsets from one random value:
//   offy = randval & 0xF, offx = randval >> 4,
// each doubled along a dimension that is not subsampled
// (sx/sy are the horizontal/vertical subsampling flags).
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF     // randval & 0xF
        lsr             \offx, \src,  #4       // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx    // 2 * (randval >> 4)
.endif
.endm
1426
// \dst = \src + \stride * \offy + \offx: resolve a byte position
// inside the grain LUT from the offsets produced by calc_offset.
.macro add_offset dst, offx, offy, src, stride
        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx          // grain_lut += offx
.endm
1431
// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
//                                const ptrdiff_t stride,
//                                const uint8_t scaling[SCALING_SIZE],
//                                const int scaling_shift,
//                                const entry grain_lut[][GRAIN_WIDTH],
//                                const int offsets[][2],
//                                const int h, const ptrdiff_t clip,
//                                const ptrdiff_t type);
// Apply luma film grain to one 32x32 block: resolve the grain_lut
// pointers from the random offsets, set up clip limits and overlap
// coefficients, then jump to the loop variant selected by 'type'.
function fgy_32x32_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
        ldrd            r6,  r7,  [sp, #108]   // offsets, h
        ldr             r8,       [sp, #116]   // clip
        mov             r9,  #GRAIN_WIDTH      // grain_lut stride

        neg             r4,  r4
        vdup.16         q13, r4                // -scaling_shift
        cmp             r8,  #0

        movrel_local    r12, overlap_coeffs_0

        beq             1f
        // clip to the restricted (studio) range
        vmov.i8         q14, #16
        vmov.i8         q15, #235
        b               2f
1:
        // no clip: full range
        vmov.i8         q14, #0
        vmov.i8         q15, #255
2:

        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs

        add             r5,  r5,  #9           // grain_lut += 9
        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r9           // grain_lut += grain_stride

        // Resolve the grain pointers used in the loop: r5 = this
        // block's grain, r4 = old grain for x overlap, r6 (below) =
        // grain above for y overlap, r8 = old grain above.
        ldr             r10, [r6, #8]          // offsets[1][0]
        calc_offset     r10, r4,  r10, 0,   0
        add_offset      r4,  r10, r4,  r5,  r9
        ldr             r10, [r6, #4]          // offsets[0][1]
        calc_offset     r10, r11, r10, 0,   0
        add_offset      r11, r10, r11, r5,  r9
        ldr             r10, [r6, #12]         // offsets[1][1]
        calc_offset     r10, r8,  r10, 0,   0
        add_offset      r8,  r10, r8,  r5,  r9
        ldr             r6,  [r6]              // offsets[0][0]
        calc_offset     r6,  lr,  r6,  0,   0
        add_offset      r5,  r6,  lr,  r5,  r9

        add             r4,  r4,  #32          // grain_lut += FG_BLOCK_SIZE * bx
        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        ldr             r10, [sp, #120]        // type
        adr             r11, L(fgy_loop_tbl)

        tst             r10, #1                // bit 0 of type = y overlap
        ldr             r10, [r11, r10, lsl #2]

        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r8,  r8,  #32          // grain_lut += FG_BLOCK_SIZE * bx

        add             r11, r11, r10          // r11 = selected loop variant

        beq             1f
        // y overlap
        vdup.8          d14, d24[0]            // first-row blend weights
        vdup.8          d15, d24[1]            // (top 27, cur 17)
        mov             r10, r7                // backup actual h
        mov             r7,  #2                // blend the first 2 rows
1:
        bx              r11
endfunc
1507
function fgy_loop_neon
L(fgy_loop_tbl):
        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB

// One specialization of the luma grain loop; each iteration handles
// one 32-pixel row:
//   dst = clip(src + round2(scaling[src] * grain, scaling_shift))
//   ox: blend the row's first grain lanes with the horizontally
//       adjacent block's grain (per-lane coeffs d24/d25, vqrshrn by 5)
//   oy: blend the whole row with the block above (weights d14/d15)
.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
        vld1.8          {d8},       [r4],       r9 // grain_lut old
.endif
.if \oy
        vld1.8          {q2, q3},   [r6],       r9 // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10},      [r8],       r9 // grain_lut top old
.endif
        vld1.8          {q0,  q1},  [r1, :128], r2 // src
        vld1.8          {q10, q11}, [r5],       r9 // grain_lut

.if \ox
        vmull.s8        q4,  d8,  d24              // old grain * {27,17,0,...}
        vmlal.s8        q4,  d20, d25              // + new grain * {17,27,32,...}
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top block's grain, then
        // renormalize both before the vertical blend.
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d4,  d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d4,  q5,  #5
.endif

        vmull.s8        q4,  d20, d15              // cur grain * cur weight
        vmull.s8        q5,  d21, d15
        vmull.s8        q8,  d22, d15
        vmull.s8        q9,  d23, d15
        vmlal.s8        q4,  d4,  d14              // + top grain * top weight
        vmlal.s8        q5,  d5,  d14
        vmlal.s8        q8,  d6,  d14
        vmlal.s8        q9,  d7,  d14
        vqrshrn.s16     d20, q4,  #5               // round2(blend, 5)
        vqrshrn.s16     d21, q5,  #5
        vqrshrn.s16     d22, q8,  #5
        vqrshrn.s16     d23, q9,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif

        bl              gather32_neon              // d8-d11 = scaling[src]

        vmovl.s8        q8,  d20       // grain
        vmovl.s8        q9,  d21
        vmovl.s8        q10, d22
        vmovl.s8        q11, d23

        vmovl.u8        q2,  d8        // scaling
        vmovl.u8        q3,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vmul.i16        q8,  q8,  q2   // scaling * grain
        vmul.i16        q9,  q9,  q3
        vmul.i16        q10, q10, q4
        vmul.i16        q11, q11, q5

        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13
        vrshl.s16       q10, q10, q13
        vrshl.s16       q11, q11, q13

        vaddw.u8        q8,  q8,  d0   // *src + noise
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vaddw.u8        q11, q11, d3

        vqmovun.s16     d0,  q8        // saturate to u8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11

        vmax.u8         q0,  q0,  q14  // clamp into [q14, q15]
        vmax.u8         q1,  q1,  q14  // (clip range set by the caller)
        vmin.u8         q0,  q0,  q15
        vmin.u8         q1,  q1,  q15

        subs            r7,  r7,  #1
.if \oy
        vdup.8          d14, d25[0]    // second-row blend weights
        vdup.8          d15, d25[1]    // (top 17, cur 27)
.endif
        vst1.8          {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r10, #2
        sub             r7,  r10, #2           // restore actual remaining h
        bgt             L(loop_\ox\()0)        // rest of the rows, no y overlap
.endif
        vpop            {q4-q7}
        pop             {r4-r11,pc}
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc
1618
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
//                                     const pixel *const src,
//                                     const ptrdiff_t stride,
//                                     const uint8_t scaling[SCALING_SIZE],
//                                     const Dav1dFilmGrainData *const data,
//                                     const entry grain_lut[][GRAIN_WIDTH],
//                                     const pixel *const luma_row,
//                                     const ptrdiff_t luma_stride,
//                                     const int offsets[][2],
//                                     const ptrdiff_t h, const ptrdiff_t uv,
//                                     const ptrdiff_t is_id,
//                                     const ptrdiff_t type);
// Set up and dispatch chroma grain application for one block.
// sx/sy are the horizontal/vertical chroma subsampling flags; 'type'
// indexes the 8-entry loop table below (bit 0 = y overlap, bit 1 =
// x overlap, bit 2 selects the csfl variant).
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride
        ldrd            r8,  r9,  [sp, #116]   // offsets, h
        ldrd            r10, r11, [sp, #124]   // uv, is_id

        // !csfl: load the chroma mapping constants into d4
        // (lane 0 = uv_luma_mult, lane 1 = uv_mult, lane 2 = uv_offset)
        add             r10, r4,  r10, lsl #2  // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        add             r10, r10, #FGD_UV_OFFSET
        vld1.16         {d4[]},  [r12]         // uv_luma_mult
        vld1.16         {d4[2]}, [r10]         // uv_offset
        vld1.16         {d4[1]}, [lr]          // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        neg             lr,  lr                // -scaling_shift

        cmp             r12, #0
        vdup.16         q13, lr                // -scaling_shift

        beq             1f
        // clip to the restricted (studio) range
        cmp             r11, #0
        vmov.i8         q14, #16
        vmov.i8         q15, #240              // chroma upper bound
        beq             2f
        // is_id: identity matrix coefficients, use the luma bound
        vmov.i8         q15, #235
        b               2f
1:
        // no clip: full range
        vmov.i8         q14, #0
        vmov.i8         q15, #255
2:

        mov             r10, #GRAIN_WIDTH      // grain_lut stride

        add             r5,  r5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10          // grain_lut += grain_stride
.endif

        // Resolve the grain pointers used in the loop: r5 = this
        // block's grain, r4 = old grain for x overlap, r8 = grain
        // above for y overlap, r11 = old grain above.
        ldr             r12, [r8, #8]          // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5,  r10

        ldr             r12, [r8, #4]          // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5,  r10

        ldr             r12, [r8, #12]         // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5,  r10

        ldr             r8,  [r8]              // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5,  r10

        add             r4,  r4,  #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx
        ldr             lr,  [sp, #132]        // type

        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        tst             lr,  #1                // bit 0 of type = y overlap
        ldr             lr,  [r12, lr,  lsl #2]

        add             r12, r12, lr           // r12 = selected loop variant

        beq             1f
        // y overlap
        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
        mov             r9,  #(2 >> \sy)       // rows to blend with the top

1:

.if \sy
        vmov.i8         d6,  #23               // y-overlap blend weights
        vmov.i8         d7,  #22               // (vertically subsampled)
.else
        vmov.i8         d6,  #27               // y-overlap blend weights
        vmov.i8         d7,  #17
.endif

.if \sy
        add             r7,  r7,  r7           // luma_stride *= 2
.endif

        bx              r12
endfunc
.endm
1744
// Instantiate the chroma film-grain entry points, one per chroma layout:
//   fguv <layout>, <sx>, <sy>
// where sx/sy are the horizontal/vertical chroma subsampling shifts.
fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
1748
// Row loop for chroma film grain with no horizontal subsampling
// (sx == 0, i.e. the 4:4:4 layout). Never called directly: the fguv
// entry point above indexes L(fguv_loop_sx0_tbl) with the "type"
// argument and enters via "bx r12". Judging by the table layout and
// the "tst lr, #1" in the entry code, the index bits are:
//   bit 2 = csfl (chroma scaled from luma), bit 1 = ox (x overlap),
//   bit 0 = oy (y overlap).
//
// State on entry (established by the fguv macro, whose tail is above):
//   r0  = dst, r1 = src, r2 = stride
//   r4  = grain_lut left-overlap column, r5 = grain_lut
//   r6  = luma, r7 = luma stride
//   r8  = grain_lut top row, r11 = grain_lut top-left corner
//   r9  = rows to process (just the overlap rows first, when oy)
//   r10 = grain_lut stride, lr = remaining h after the overlap rows
//   q12 = overlap_coeffs, q13 = shift vector for vrshl
//         (presumably -scaling_shift; set up before entry — confirm)
//   q14/q15 = clip min/max, d4 = csfl==0 luma/chroma muls and offset
//   d6/d7   = vertical overlap coefficients
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

// One specialized loop body per (csfl, ox, oy) combination; processes
// one 32-pixel-wide row per iteration.
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr        // backup remaining h for after the overlap rows
.endif
1:
.if \ox
        vld1.8          {d8},       [r4],        r10 // grain_lut old
.endif
.if \oy
        vld1.8          {q8, q9},   [r8],        r10 // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10},      [r11],       r10 // grain_lut top old
.endif
        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
        vld1.8          {q10, q11}, [r5],        r10 // grain_lut

.if \ox
        // Horizontal overlap: blend old (left) and current grain
        // columns with the overlap coefficients in d24/d25; the sum is
        // narrowed with round2(x, 5) below.
        vmull.s8        q4,  d8,  d24
        vmlal.s8        q4,  d20, d25
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top row, then narrow both.
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d16, d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d16, q5,  #5
.endif

        // Vertical overlap: grain = round2(cur*d7 + top*d6, 5).
        vmull.s8        q4,  d20, d7
        vmull.s8        q5,  d21, d7
        vmull.s8        q6,  d22, d7
        vmull.s8        q7,  d23, d7
        vmlal.s8        q4,  d16, d6
        vmlal.s8        q5,  d17, d6
        vmlal.s8        q6,  d18, d6
        vmlal.s8        q7,  d19, d6
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d21, q5,  #5
        vqrshrn.s16     d22, q6,  #5
        vqrshrn.s16     d23, q7,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif
.if !\csfl
        // csfl == 0: the scaling LUT is indexed by a luma/chroma blend
        // instead of raw luma:
        //   idx = sat_u8(sshr(luma*d4[0] +sat chroma*d4[1], 6) + d4[2])
        // leaving the blended values in q0/q1 for gather32_neon.
        vld1.8          {q8,  q9},  [r1, :128] // src
        vmovl.u8        q4,  d0
        vmovl.u8        q5,  d1
        vmovl.u8        q6,  d2
        vmovl.u8        q7,  d3
        vmovl.u8        q0,  d16
        vmovl.u8        q1,  d17
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmul.i16        q4,  q4,  d4[0]
        vmul.i16        q5,  q5,  d4[0]
        vmul.i16        q6,  q6,  d4[0]
        vmul.i16        q7,  q7,  d4[0]
        vmul.i16        q0,  q0,  d4[1]
        vmul.i16        q1,  q1,  d4[1]
        vmul.i16        q8,  q8,  d4[1]
        vmul.i16        q9,  q9,  d4[1]
        vqadd.s16       q4,  q4,  q0
        vqadd.s16       q5,  q5,  q1
        vqadd.s16       q6,  q6,  q8
        vqadd.s16       q7,  q7,  q9
        vdup.16         q0,  d4[2]
        vshr.s16        q4,  q4,  #6
        vshr.s16        q5,  q5,  #6
        vshr.s16        q6,  q6,  #6
        vshr.s16        q7,  q7,  #6
        vadd.i16        q4,  q4,  q0
        vadd.i16        q5,  q5,  q0
        vadd.i16        q6,  q6,  q0
        vadd.i16        q7,  q7,  q0
        vqmovun.s16     d0,  q4
        vqmovun.s16     d1,  q5
        vqmovun.s16     d2,  q6
        vqmovun.s16     d3,  q7
.endif

        // Scaling LUT lookup for the 32 pixels in q0/q1; results are
        // consumed from d8-d11 below.
        bl              gather32_neon

        vld1.8          {q0,  q1},  [r1, :128], r2 // src

        vmovl.s8        q8,  d20       // grain
        vmovl.s8        q9,  d21
        vmovl.s8        q10, d22
        vmovl.s8        q11, d23

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vmul.i16        q8,  q8,  q6   // scaling * grain
        vmul.i16        q9,  q9,  q7
        vmul.i16        q10, q10, q4
        vmul.i16        q11, q11, q5

        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13
        vrshl.s16       q10, q10, q13
        vrshl.s16       q11, q11, q13

        vaddw.u8        q8,  q8,  d0   // *src + noise
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vaddw.u8        q11, q11, d3

        vqmovun.s16     d0,  q8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11

        // Clamp to the valid pixel range (q14/q15 set up on entry).
        vmax.u8         q0,  q0,  q14
        vmax.u8         q1,  q1,  q14
        vmin.u8         q0,  q0,  q15
        vmin.u8         q1,  q1,  q15

        subs            r9,  r9,  #1
.if \oy
        // After the first overlap row, use the next pair of vertical
        // overlap coefficients from overlap_coeffs (d25).
        vdup.8          d6,  d25[0]
        vdup.8          d7,  d25[1]
.endif

        vst1.8          {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Overlap rows done; handle the remaining rows (backed up in
        // r12) with the corresponding no-y-overlap variant.
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

// Shared epilogue for all variants above.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
1911
// Row loop for chroma film grain with horizontal subsampling (sx == 1,
// used by the 4:2:0 and 4:2:2 layouts); rows are 16 chroma pixels wide
// and each chroma pixel corresponds to a horizontal luma pair. Entered
// via L(fguv_loop_sx1_tbl) / "bx r12" from the fguv entry point, with
// the same register setup and table-index bits as fguv_loop_sx0_neon:
// bit 2 = csfl, bit 1 = ox (x overlap), bit 0 = oy (y overlap).
// For 4:2:0 (sy == 1) the entry code has already doubled r7 (the luma
// stride), so one luma row is consumed per chroma row here.
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

// One specialized loop body per (csfl, ox, oy) combination; processes
// one 16-pixel-wide chroma row per iteration.
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr        // backup remaining h for after the overlap rows
.endif
1:
.if \ox
        vld1.8          {d8},       [r4],        r10 // grain_lut old
.endif
.if \oy
        vld1.8          {q8},       [r8],        r10 // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10},      [r11],       r10 // grain_lut top old
.endif
        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
        vld1.8          {q10},      [r5],        r10 // grain_lut
        vld1.8          {q11},      [r1, :128],  r2  // src

.if \ox
        // Horizontal overlap: blend old (left) and current grain with
        // the overlap coefficients in d24/d25; narrowed with
        // round2(x, 5) below.
        vmull.s8        q4,  d8,  d24
        vmlal.s8        q4,  d20, d25
.endif

        // Sum each horizontal luma pair (averaged by >> 1 further down).
        vpaddl.u8       q0,  q0
        vpaddl.u8       q1,  q1
.if \oy
.if \ox
        // Same horizontal blend for the top row, then narrow both.
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d16, d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d16, q5,  #5
.endif

        // Vertical overlap: grain = round2(cur*d7 + top*d6, 5).
        vmull.s8        q4,  d20, d7
        vmull.s8        q5,  d21, d7
        vmlal.s8        q4,  d16, d6
        vmlal.s8        q5,  d17, d6
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d21, q5,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif
.if \csfl
        // csfl == 1: index the scaling LUT with the averaged luma.
        vrshrn.u16      d0,  q0,  #1
        vrshrn.u16      d1,  q1,  #1
.else
        // csfl == 0: index with a blend of averaged luma and chroma:
        //   idx = sat_u8(sshr(luma*d4[0] +sat chroma*d4[1], 6) + d4[2])
        vrshr.u16       q4,  q0,  #1
        vrshr.u16       q5,  q1,  #1
        vmovl.u8        q0,  d22
        vmovl.u8        q1,  d23
        vmul.i16        q4,  q4,  d4[0]
        vmul.i16        q5,  q5,  d4[0]
        vmul.i16        q0,  q0,  d4[1]
        vmul.i16        q1,  q1,  d4[1]
        vqadd.s16       q4,  q4,  q0
        vqadd.s16       q5,  q5,  q1
        vdup.16         q0,  d4[2]
        vshr.s16        q4,  q4,  #6
        vshr.s16        q5,  q5,  #6
        vadd.i16        q4,  q4,  q0
        vadd.i16        q5,  q5,  q0
        vqmovun.s16     d0,  q4
        vqmovun.s16     d1,  q5
.endif

        // Scaling LUT lookup for the 16 pixels in q0; results are
        // consumed from d8/d9 below.
        bl              gather16_neon

        vmovl.s8        q8,  d20       // grain
        vmovl.s8        q9,  d21

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9

        vmul.i16        q8,  q8,  q6   // scaling * grain
        vmul.i16        q9,  q9,  q7

        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13

        vaddw.u8        q8,  q8,  d22  // *src + noise
        vaddw.u8        q9,  q9,  d23

        vqmovun.s16     d0,  q8
        vqmovun.s16     d1,  q9

        // Clamp to the valid pixel range (q14/q15 set up on entry).
        vmax.u8         q0,  q0,  q14
        vmin.u8         q0,  q0,  q15

        subs            r9,  r9,  #1
.if \oy
        // Alternate the two vertical overlap coefficients per row.
        vswp            d6,  d7
.endif
        vst1.8          {q0}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Overlap rows done; handle the remaining rows (backed up in
        // r12) with the corresponding no-y-overlap variant.
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

// Shared epilogue for all variants above.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
2040