/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
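// Per output pixel this forms a 3x3 weighted sum of the int16 b[] rows (the
// diagonal neighbours weighted by 3, the centre and the horizontal/vertical
// neighbours by 4) giving "a", the same weighted sum of the int32 a[] rows
// giving "b", and then stores, roughly,
//   tmp[x] = (b + a * src[x] + (1 << 8)) >> 9
// with tmp rows FILTER_OUT_STRIDE elements apart.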
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldr             r6,  [sp, #108]
        sub             r7,  r3,  #(4*SUM_STRIDE)
        add             r8,  r3,  #(4*SUM_STRIDE)
        sub             r9,  r4,  #(2*SUM_STRIDE)
        add             r10, r4,  #(2*SUM_STRIDE)
        mov             r11, #SUM_STRIDE
        mov             r12, #FILTER_OUT_STRIDE
        add             lr,  r5,  #3
        bic             lr,  lr,  #3 // Aligned width
.if \bpc == 8
        sub             r2,  r2,  lr
.else
        sub             r2,  r2,  lr, lsl #1
.endif
        sub             r12, r12, lr
        sub             r11, r11, lr
        sub             r11, r11, #4 // We read 4 extra elements from both a and b
        mov             lr,  r5
        vmov.i16        q14, #3
        vmov.i32        q15, #3
1:
        vld1.16         {q0},       [r9,  :128]!
        vld1.16         {q1},       [r4,  :128]!
        vld1.16         {q2},       [r10, :128]!
        vld1.32         {q8,  q9},  [r7,  :128]!
        vld1.32         {q10, q11}, [r3,  :128]!
        vld1.32         {q12, q13}, [r8,  :128]!

2:
        subs            r5,  r5,  #4
        vext.8          d6,  d0,  d1,  #2  // -stride
        vext.8          d7,  d2,  d3,  #2  // 0
        vext.8          d8,  d4,  d5,  #2  // +stride
        vext.8          d9,  d0,  d1,  #4  // +1-stride
        vext.8          d10, d2,  d3,  #4  // +1
        vext.8          d11, d4,  d5,  #4  // +1+stride
        vadd.i16        d2,  d2,  d6       // -1, -stride
        vadd.i16        d7,  d7,  d8       // 0, +stride
        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
        vadd.i16        d2,  d2,  d7
        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
        vadd.i16        d2,  d2,  d10      // +1
        vadd.i16        d0,  d0,  d4

        vext.8          q3,  q8,  q9,  #4  // -stride
        vshl.i16        d2,  d2,  #2
        vext.8          q4,  q8,  q9,  #8  // +1-stride
        vext.8          q5,  q10, q11, #4  // 0
        vext.8          q6,  q10, q11, #8  // +1
        vmla.i16        d2,  d0,  d28      // * 3 -> a
        vadd.i32        q3,  q3,  q10      // -stride, -1
        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q6       // 0, +1
        vadd.i32        q8,  q8,  q12      // -1+stride
        vadd.i32        q3,  q3,  q5
        vext.8          q7,  q12, q13, #4  // +stride
        vext.8          q10, q12, q13, #8  // +1+stride
.if \bpc == 8
        vld1.32         {d24[0]}, [r1, :32]! // src
.else
        vld1.16         {d24}, [r1, :64]!    // src
.endif
        vadd.i32        q3,  q3,  q7       // +stride
        vadd.i32        q8,  q8,  q10      // +1+stride
        vshl.i32        q3,  q3,  #2
        vmla.i32        q3,  q8,  q15      // * 3 -> b
.if \bpc == 8
        vmovl.u8        q12, d24           // src
.endif
        vmov            d0,  d1
        vmlal.u16       q3,  d2,  d24      // b + a * src
        vmov            d2,  d3
        vrshrn.i32      d6,  q3,  #9
        vmov            d4,  d5
        vst1.16         {d6}, [r0]!

        ble             3f
        vmov            q8,  q9
        vmov            q10, q11
        vmov            q12, q13
        vld1.16         {d1},  [r9,  :64]!
        vld1.16         {d3},  [r4,  :64]!
        vld1.16         {d5},  [r10, :64]!
        vld1.32         {q9},  [r7,  :128]!
        vld1.32         {q11}, [r3,  :128]!
        vld1.32         {q13}, [r8,  :128]!
        b               2b

3:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        add             r0,  r0,  r12, lsl #1
        add             r1,  r1,  r2
        add             r3,  r3,  r11, lsl #2
        add             r7,  r7,  r11, lsl #2
        add             r8,  r8,  r11, lsl #2
        add             r4,  r4,  r11, lsl #1
        add             r9,  r9,  r11, lsl #1
        add             r10, r10, r11, lsl #1
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
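// Per output pixel this combines the row of box sums above and the row below
// (for the 5x5 box the sums only exist on every second row): the horizontal
// neighbours in those rows are weighted by 5 and the centre columns by 6,
// applied to both the int16 b[] rows (-> "a") and the int32 a[] rows
// (-> "b"), and then, roughly,
//   tmp[x] = (b + a * src[x] + (1 << 8)) >> 9
// The in-between output rows are handled by the loop at 4: below, which
// filters a single row of sums with a 5/6/5 horizontal weighting and uses
// >> 8 instead.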
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldr             r6,  [sp, #108]
        add             r7,  r3,  #(4*(SUM_STRIDE))
        sub             r3,  r3,  #(4*(SUM_STRIDE))
        add             r8,  r4,  #(2*(SUM_STRIDE))
        sub             r4,  r4,  #(2*(SUM_STRIDE))
        mov             r9,  #(2*SUM_STRIDE)
        mov             r10, #FILTER_OUT_STRIDE
        add             r11, r5,  #7
        bic             r11, r11, #7 // Aligned width
.if \bpc == 8
        sub             r2,  r2,  r11
.else
        sub             r2,  r2,  r11, lsl #1
.endif
        sub             r10, r10, r11
        sub             r9,  r9,  r11
        sub             r9,  r9,  #4 // We read 4 extra elements from a
        sub             r12, r9,  #4 // We read 8 extra elements from b
        mov             lr,  r5

1:
        vld1.16         {q0,  q1},  [r4, :128]!
        vld1.16         {q2,  q3},  [r8, :128]!
        vld1.32         {q8,  q9},  [r3, :128]!
        vld1.32         {q11, q12}, [r7, :128]!
        vld1.32         {q10},      [r3, :128]!
        vld1.32         {q13},      [r7, :128]!

2:
        vmov.i16        q14, #5
        vmov.i16        q15, #6
        subs            r5,  r5,  #8
        vext.8          q4,  q0,  q1,  #4  // +1-stride
        vext.8          q5,  q2,  q3,  #4  // +1+stride
        vext.8          q6,  q0,  q1,  #2  // -stride
        vext.8          q7,  q2,  q3,  #2  // +stride
        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
        vadd.i16        q2,  q6,  q7       // -stride, +stride
        vadd.i16        q0,  q0,  q5

        vext.8          q4,  q8,  q9,  #8  // +1-stride
        vext.8          q5,  q9,  q10, #8
        vext.8          q6,  q11, q12, #8  // +1+stride
        vext.8          q7,  q12, q13, #8
        vmul.i16        q0,  q0,  q14      // * 5
        vmla.i16        q0,  q2,  q15      // * 6
        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q9
        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
        vadd.i32        q7,  q7,  q12
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q7
        vext.8          q6,  q8,  q9,  #4  // -stride
        vext.8          q7,  q9,  q10, #4
        vext.8          q8,  q11, q12, #4  // +stride
        vext.8          q11, q12, q13, #4

.if \bpc == 8
        vld1.8          {d4}, [r1, :64]!
.else
        vld1.16         {q2}, [r1, :128]!
.endif

        vmov.i32        q14, #5
        vmov.i32        q15, #6

        vadd.i32        q6,  q6,  q8       // -stride, +stride
        vadd.i32        q7,  q7,  q11
        vmul.i32        q4,  q4,  q14      // * 5
        vmla.i32        q4,  q6,  q15      // * 6
        vmul.i32        q5,  q5,  q14      // * 5
        vmla.i32        q5,  q7,  q15      // * 6

.if \bpc == 8
        vmovl.u8        q2,  d4
.endif
        vmlal.u16       q4,  d0,  d4       // b + a * src
        vmlal.u16       q5,  d1,  d5       // b + a * src
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #9
        vrshrn.i32      d9,  q5,  #9
        vmov            q2,  q3
        vst1.16         {q4}, [r0, :128]!

        ble             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.16         {q1},       [r4, :128]!
        vld1.16         {q3},       [r8, :128]!
        vld1.32         {q9,  q10}, [r3, :128]!
        vld1.32         {q12, q13}, [r7, :128]!
        b               2b

3:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        add             r0,  r0,  r10, lsl #1
        add             r1,  r1,  r2
        add             r3,  r3,  r9,  lsl #2
        add             r7,  r7,  r9,  lsl #2
        add             r4,  r4,  r12, lsl #1
        add             r8,  r8,  r12, lsl #1

        vld1.32         {q8, q9}, [r3, :128]!
        vld1.16         {q0, q1}, [r4, :128]!
        vld1.32         {q10},    [r3, :128]!

        vmov.i16        q12, #5
        vmov.i16        q13, #6

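// In-between rows: only a single row of box sums is available here, so it is
// filtered with a 5/6/5 horizontal weighting and rounded with >> 8 instead
// of >> 9.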
4:
        subs            r5,  r5,  #8
        vext.8          q3,  q0,  q1,  #4  // +1
        vext.8          q2,  q0,  q1,  #2  // 0
        vadd.i16        q0,  q0,  q3       // -1, +1

        vext.8          q4,  q8,  q9,  #4  // 0
        vext.8          q5,  q9,  q10, #4
        vext.8          q6,  q8,  q9,  #8  // +1
        vext.8          q7,  q9,  q10, #8
        vmul.i16        q2,  q2,  q13      // * 6
        vmla.i16        q2,  q0,  q12      // * 5 -> a
.if \bpc == 8
        vld1.8          {d22}, [r1, :64]!
.else
        vld1.16         {q11}, [r1, :128]!
.endif
        vadd.i32        q8,  q8,  q6       // -1, +1
        vadd.i32        q9,  q9,  q7
.if \bpc == 8
        vmovl.u8        q11, d22
.endif
        vmul.i32        q4,  q4,  q15      // * 6
        vmla.i32        q4,  q8,  q14      // * 5 -> b
        vmul.i32        q5,  q5,  q15      // * 6
        vmla.i32        q5,  q9,  q14      // * 5 -> b

        vmlal.u16       q4,  d4,  d22      // b + a * src
        vmlal.u16       q5,  d5,  d23
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #8
        vrshrn.i32      d9,  q5,  #8
        vmov            q8,  q10
        vst1.16         {q4}, [r0, :128]!

        ble             5f
        vld1.16         {q1},      [r4, :128]!
        vld1.32         {q9, q10}, [r3, :128]!
        b               4b

5:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
        sub             r4,  r4,  r11, lsl #1
        add             r0,  r0,  r10, lsl #1
        add             r1,  r1,  r2
        sub             r3,  r3,  #16
        sub             r4,  r4,  #16
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int w, const int h,
//                                    const int wt, const int bitdepth_max);
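// Per pixel this blends the filtered value t1[x] with the source, roughly:
//   u      = src[x] << 4
//   v      = (u << 7) + wt * (t1[x] - u)
//   dst[x] = clip((v + (1 << 10)) >> 11)
// clipped to [0, 255] for 8 bpc and to [0, bitdepth_max] for 16 bpc.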
function sgr_weighted1_\bpc\()bpc_neon, export=1
        push            {r4-r9,lr}
        ldrd            r4,  r5,  [sp, #28]
        ldrd            r6,  r7,  [sp, #36]
.if \bpc == 16
        ldr             r8,  [sp, #44]
.endif
        vdup.16         d31, r7
        cmp             r6,  #2
.if \bpc == 16
        vdup.16         q14, r8
.endif
        add             r9,  r0,  r1
        add             r12, r2,  r3
        add             lr,  r4,  #2*FILTER_OUT_STRIDE
        mov             r7,  #(4*FILTER_OUT_STRIDE)
        lsl             r1,  r1,  #1
        lsl             r3,  r3,  #1
        add             r8,  r5,  #7
        bic             r8,  r8,  #7 // Aligned width
.if \bpc == 8
        sub             r1,  r1,  r8
        sub             r3,  r3,  r8
.else
        sub             r1,  r1,  r8, lsl #1
        sub             r3,  r3,  r8, lsl #1
.endif
        sub             r7,  r7,  r8, lsl #1
        mov             r8,  r5
        blt             2f
1:
.if \bpc == 8
        vld1.8          {d0},  [r2,  :64]!
        vld1.8          {d16}, [r12, :64]!
.else
        vld1.16         {q0},  [r2,  :128]!
        vld1.16         {q8},  [r12, :128]!
.endif
        vld1.16         {q1},  [r4,  :128]!
        vld1.16         {q9},  [lr,  :128]!
        subs            r5,  r5,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4     // u
        vshll.u8        q8,  d16, #4     // u
.else
        vshl.i16        q0,  q0,  #4     // u
        vshl.i16        q8,  q8,  #4     // u
.endif
        vsub.i16        q1,  q1,  q0     // t1 - u
        vsub.i16        q9,  q9,  q8     // t1 - u
        vshll.u16       q2,  d0,  #7     // u << 7
        vshll.u16       q3,  d1,  #7     // u << 7
        vshll.u16       q10, d16, #7     // u << 7
        vshll.u16       q11, d17, #7     // u << 7
        vmlal.s16       q2,  d2,  d31    // v
        vmlal.s16       q3,  d3,  d31    // v
        vmlal.s16       q10, d18, d31    // v
        vmlal.s16       q11, d19, d31    // v
.if \bpc == 8
        vrshrn.i32      d4,  q2,  #11
        vrshrn.i32      d5,  q3,  #11
        vrshrn.i32      d20, q10, #11
        vrshrn.i32      d21, q11, #11
        vqmovun.s16     d4,  q2
        vqmovun.s16     d20, q10
        vst1.8          {d4},  [r0, :64]!
        vst1.8          {d20}, [r9, :64]!
.else
        vqrshrun.s32    d4,  q2,  #11
        vqrshrun.s32    d5,  q3,  #11
        vqrshrun.s32    d20, q10, #11
        vqrshrun.s32    d21, q11, #11
        vmin.u16        q2,  q2,  q14
        vmin.u16        q10, q10, q14
        vst1.16         {q2},  [r0, :128]!
        vst1.16         {q10}, [r9, :128]!
.endif
        bgt             1b

        sub             r6,  r6,  #2
        cmp             r6,  #1
        blt             0f
        mov             r5,  r8
        add             r0,  r0,  r1
        add             r9,  r9,  r1
        add             r2,  r2,  r3
        add             r12, r12, r3
        add             r4,  r4,  r7
        add             lr,  lr,  r7
        beq             2f
        b               1b

2:
.if \bpc == 8
        vld1.8          {d0}, [r2, :64]!
.else
        vld1.16         {q0}, [r2, :128]!
.endif
        vld1.16         {q1}, [r4, :128]!
        subs            r5,  r5,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4     // u
.else
        vshl.i16        q0,  q0,  #4     // u
.endif
        vsub.i16        q1,  q1,  q0     // t1 - u
        vshll.u16       q2,  d0,  #7     // u << 7
        vshll.u16       q3,  d1,  #7     // u << 7
        vmlal.s16       q2,  d2,  d31    // v
        vmlal.s16       q3,  d3,  d31    // v
.if \bpc == 8
        vrshrn.i32      d4,  q2,  #11
        vrshrn.i32      d5,  q3,  #11
        vqmovun.s16     d2,  q2
        vst1.8          {d2}, [r0, :64]!
.else
        vqrshrun.s32    d4,  q2,  #11
        vqrshrun.s32    d5,  q3,  #11
        vmin.u16        q2,  q2,  q14
        vst1.16         {q2}, [r0, :128]!
.endif
        bgt             2b
0:
        pop             {r4-r9,pc}
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
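// Per pixel this blends the two filtered values t1[x] and t2[x] with the
// source, roughly:
//   u      = src[x] << 4
//   v      = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u)
//   dst[x] = clip((v + (1 << 10)) >> 11)
// clipped to [0, 255] for 8 bpc and to [0, bitdepth_max] for 16 bpc.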
function sgr_weighted2_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        ldrd            r4,  r5,  [sp, #36]
        ldrd            r6,  r7,  [sp, #44]
.if \bpc == 8
        ldr             r8,  [sp, #52]
.else
        ldrd            r8,  r9,  [sp, #52]
.endif
        cmp             r7,  #2
        add             r10, r0,  r1
        add             r11, r2,  r3
        add             r12, r4,  #2*FILTER_OUT_STRIDE
        add             lr,  r5,  #2*FILTER_OUT_STRIDE
        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
.if \bpc == 16
        vdup.16         q14, r9
.endif
        mov             r8,  #4*FILTER_OUT_STRIDE
        lsl             r1,  r1,  #1
        lsl             r3,  r3,  #1
        add             r9,  r6,  #7
        bic             r9,  r9,  #7 // Aligned width
.if \bpc == 8
        sub             r1,  r1,  r9
        sub             r3,  r3,  r9
.else
        sub             r1,  r1,  r9, lsl #1
        sub             r3,  r3,  r9, lsl #1
.endif
        sub             r8,  r8,  r9, lsl #1
        mov             r9,  r6
        blt             2f
1:
.if \bpc == 8
        vld1.8          {d0},  [r2,  :64]!
        vld1.8          {d16}, [r11, :64]!
.else
        vld1.16         {q0},  [r2,  :128]!
        vld1.16         {q8},  [r11, :128]!
.endif
        vld1.16         {q1},  [r4,  :128]!
        vld1.16         {q9},  [r12, :128]!
        vld1.16         {q2},  [r5,  :128]!
        vld1.16         {q10}, [lr,  :128]!
        subs            r6,  r6,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4     // u
        vshll.u8        q8,  d16, #4     // u
.else
        vshl.i16        q0,  q0,  #4     // u
        vshl.i16        q8,  q8,  #4     // u
.endif
        vsub.i16        q1,  q1,  q0     // t1 - u
        vsub.i16        q2,  q2,  q0     // t2 - u
        vsub.i16        q9,  q9,  q8     // t1 - u
        vsub.i16        q10, q10, q8     // t2 - u
        vshll.u16       q3,  d0,  #7     // u << 7
        vshll.u16       q0,  d1,  #7     // u << 7
        vshll.u16       q11, d16, #7     // u << 7
        vshll.u16       q8,  d17, #7     // u << 7
        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
.if \bpc == 8
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q0,  #11
        vrshrn.i32      d22, q11, #11
        vrshrn.i32      d23, q8,  #11
        vqmovun.s16     d6,  q3
        vqmovun.s16     d22, q11
        vst1.8          {d6},  [r0,  :64]!
        vst1.8          {d22}, [r10, :64]!
.else
        vqrshrun.s32    d6,  q3,  #11
        vqrshrun.s32    d7,  q0,  #11
        vqrshrun.s32    d22, q11, #11
        vqrshrun.s32    d23, q8,  #11
        vmin.u16        q3,  q3,  q14
        vmin.u16        q11, q11, q14
        vst1.16         {q3},  [r0,  :128]!
        vst1.16         {q11}, [r10, :128]!
.endif
        bgt             1b

        subs            r7,  r7,  #2
        cmp             r7,  #1
        blt             0f
        mov             r6,  r9
        add             r0,  r0,  r1
        add             r10, r10, r1
        add             r2,  r2,  r3
        add             r11, r11, r3
        add             r4,  r4,  r8
        add             r12, r12, r8
        add             r5,  r5,  r8
        add             lr,  lr,  r8
        beq             2f
        b               1b

2:
.if \bpc == 8
        vld1.8          {d0}, [r2, :64]!
.else
        vld1.16         {q0}, [r2, :128]!
.endif
        vld1.16         {q1}, [r4, :128]!
        vld1.16         {q2}, [r5, :128]!
        subs            r6,  r6,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4     // u
.else
        vshl.i16        q0,  q0,  #4     // u
.endif
        vsub.i16        q1,  q1,  q0     // t1 - u
        vsub.i16        q2,  q2,  q0     // t2 - u
        vshll.u16       q3,  d0,  #7     // u << 7
        vshll.u16       q0,  d1,  #7     // u << 7
        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
.if \bpc == 8
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q0,  #11
        vqmovun.s16     d6,  q3
        vst1.8          {d6}, [r0, :64]!
.else
        vqrshrun.s32    d6,  q3,  #11
        vqrshrun.s32    d7,  q0,  #11
        vmin.u16        q3,  q3,  q14
        vst1.16         {q3}, [r0, :128]!
.endif
        bgt             2b
0:
        pop             {r4-r11,pc}
endfunc
.endm