1*c0909341SAndroid Build Coastguard Worker/* 2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors 3*c0909341SAndroid Build Coastguard Worker * Copyright © 2020, Martin Storsjo 4*c0909341SAndroid Build Coastguard Worker * All rights reserved. 5*c0909341SAndroid Build Coastguard Worker * 6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without 7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met: 8*c0909341SAndroid Build Coastguard Worker * 9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this 10*c0909341SAndroid Build Coastguard Worker * list of conditions and the following disclaimer. 11*c0909341SAndroid Build Coastguard Worker * 12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice, 13*c0909341SAndroid Build Coastguard Worker * this list of conditions and the following disclaimer in the documentation 14*c0909341SAndroid Build Coastguard Worker * and/or other materials provided with the distribution. 15*c0909341SAndroid Build Coastguard Worker * 16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26*c0909341SAndroid Build Coastguard Worker */ 27*c0909341SAndroid Build Coastguard Worker 28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S" 29*c0909341SAndroid Build Coastguard Worker#include "util.S" 30*c0909341SAndroid Build Coastguard Worker 31*c0909341SAndroid Build Coastguard Workerconst right_ext_mask_buf 32*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 33*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 34*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 35*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 36*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 37*c0909341SAndroid Build Coastguard Worker .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 38*c0909341SAndroid Build Coastguard Workerright_ext_mask: 39*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 40*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 41*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 42*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 43*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 44*c0909341SAndroid Build Coastguard Worker .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 45*c0909341SAndroid Build Coastguard Workerendconst 46*c0909341SAndroid Build Coastguard Worker 47*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, 48*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], const pixel *lpf, 49*c0909341SAndroid Build Coastguard Worker// const int w, int h, 50*c0909341SAndroid Build Coastguard Worker// const int16_t filter[2][8], 51*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges, 52*c0909341SAndroid Build Coastguard Worker// const int bitdepth_max); 53*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_16bpc_neon, export=1 54*c0909341SAndroid Build Coastguard Worker ldr w8, [sp] 55*c0909341SAndroid Build Coastguard Worker AARCH64_SIGN_LINK_REGISTER 56*c0909341SAndroid Build Coastguard Worker stp x29, x30, [sp, #-32]! 57*c0909341SAndroid Build Coastguard Worker stp d8, d9, [sp, #16] 58*c0909341SAndroid Build Coastguard Worker mov x29, sp 59*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x6] 60*c0909341SAndroid Build Coastguard Worker tst w7, #4 // LR_HAVE_TOP 61*c0909341SAndroid Build Coastguard Worker sub_sp 384*2*6 62*c0909341SAndroid Build Coastguard Worker 63*c0909341SAndroid Build Coastguard Worker dup v28.8h, w8 // bitdepth_max 64*c0909341SAndroid Build Coastguard Worker clz w8, w8 65*c0909341SAndroid Build Coastguard Worker movi v30.4s, #1 66*c0909341SAndroid Build Coastguard Worker sub w10, w8, #38 // -(bitdepth + 6) 67*c0909341SAndroid Build Coastguard Worker sub w11, w8, #11 // round_bits_v 68*c0909341SAndroid Build Coastguard Worker sub w8, w8, #25 // -round_bits_h 69*c0909341SAndroid Build Coastguard Worker neg w10, w10 // bitdepth + 6 70*c0909341SAndroid Build Coastguard Worker neg w11, w11 // -round_bits_v 71*c0909341SAndroid Build Coastguard Worker dup v2.4s, w10 72*c0909341SAndroid Build Coastguard Worker dup v29.4s, w8 // -round_bits_h 73*c0909341SAndroid Build Coastguard Worker dup v27.4s, w11 // -round_bits_v 74*c0909341SAndroid Build Coastguard Worker movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 75*c0909341SAndroid Build Coastguard Worker ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) 76*c0909341SAndroid Build Coastguard Worker 77*c0909341SAndroid Build Coastguard Worker zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 78*c0909341SAndroid Build Coastguard Worker 79*c0909341SAndroid Build Coastguard Worker // x9 - t6 80*c0909341SAndroid Build Coastguard Worker // x10 - t5 81*c0909341SAndroid Build Coastguard Worker // x11 - t4 82*c0909341SAndroid Build Coastguard Worker // x12 - t3 83*c0909341SAndroid Build Coastguard Worker // x13 - t2 84*c0909341SAndroid Build Coastguard Worker // x14 - t1 85*c0909341SAndroid Build Coastguard Worker // x15 - t0 86*c0909341SAndroid Build Coastguard Worker mov x14, sp // t1 87*c0909341SAndroid Build Coastguard Worker b.eq L(no_top_7) 88*c0909341SAndroid Build Coastguard Worker 89*c0909341SAndroid Build Coastguard Worker mov x16, x2 // backup left 90*c0909341SAndroid Build Coastguard Worker mov x2, #0 91*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 92*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride 93*c0909341SAndroid Build Coastguard Worker mov x9, x14 // t6 94*c0909341SAndroid Build Coastguard Worker mov x10, x14 // t5 95*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 96*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 97*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 98*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride*5 99*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 100*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 101*c0909341SAndroid Build Coastguard Worker mov x2, x16 // left 102*c0909341SAndroid Build Coastguard Worker mov x16, x3 // backup lpf 103*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf = p 104*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 105*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 106*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 107*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 108*c0909341SAndroid Build Coastguard Worker b.eq L(v1_7) 109*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 110*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 111*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 112*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 113*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 114*c0909341SAndroid Build Coastguard Worker b.eq L(v2_7) 115*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 116*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 117*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 118*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 119*c0909341SAndroid Build Coastguard Worker b.eq L(v3_7) 120*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 121*c0909341SAndroid Build Coastguard Worker 122*c0909341SAndroid Build Coastguard WorkerL(main_7): 123*c0909341SAndroid Build Coastguard Worker add x15, x14, #384*2 // t0 = t1 + 384*2 124*c0909341SAndroid Build Coastguard WorkerL(main_loop_7): 125*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_hv_16bpc_neon 126*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 127*c0909341SAndroid Build Coastguard Worker b.ne L(main_loop_7) 128*c0909341SAndroid Build Coastguard Worker tst w7, #8 // LR_HAVE_BOTTOM 129*c0909341SAndroid Build Coastguard Worker b.eq L(v3_7) 130*c0909341SAndroid Build Coastguard Worker 131*c0909341SAndroid Build Coastguard Worker mov x3, x16 // restore lpf 132*c0909341SAndroid Build Coastguard Worker mov x2, #0 // left = NULL 133*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_hv_16bpc_neon 134*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_hv_16bpc_neon 135*c0909341SAndroid Build Coastguard WorkerL(v1_7): 136*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_v_16bpc_neon 137*c0909341SAndroid Build Coastguard Worker 138*c0909341SAndroid Build Coastguard Worker mov sp, x29 139*c0909341SAndroid Build Coastguard Worker ldp d8, d9, [sp, #16] 140*c0909341SAndroid Build Coastguard Worker ldp x29, x30, [sp], #32 141*c0909341SAndroid Build Coastguard Worker AARCH64_VALIDATE_LINK_REGISTER 142*c0909341SAndroid Build Coastguard Worker ret 143*c0909341SAndroid Build Coastguard Worker 144*c0909341SAndroid Build Coastguard WorkerL(no_top_7): 145*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 146*c0909341SAndroid Build Coastguard Worker add x16, x3, x1, lsl #1 // lpf += stride*6, backup 147*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf = p 148*c0909341SAndroid Build Coastguard Worker 149*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 150*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 151*c0909341SAndroid Build Coastguard Worker mov x9, x14 // t6 152*c0909341SAndroid Build Coastguard Worker mov x10, x14 // t5 153*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 154*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 155*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 156*c0909341SAndroid Build Coastguard Worker b.eq L(v1_7) 157*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += p_stride 158*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 159*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 160*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 161*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 162*c0909341SAndroid Build Coastguard Worker b.eq L(v2_7) 163*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += p_stride 164*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 165*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_h_16bpc_neon 166*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 167*c0909341SAndroid Build Coastguard Worker b.eq L(v3_7) 168*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += p_stride 169*c0909341SAndroid Build Coastguard Worker add x15, x14, #384*2 // t0 = t1 + 384*2 170*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_hv_16bpc_neon 171*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 172*c0909341SAndroid Build Coastguard Worker b.eq L(v3_7) 173*c0909341SAndroid Build Coastguard Worker add x15, x15, #384*2*4 // t0 += 384*2*4 174*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_hv_16bpc_neon 175*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 176*c0909341SAndroid Build Coastguard Worker b.ne L(main_7) 177*c0909341SAndroid Build Coastguard WorkerL(v3_7): 178*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_v_16bpc_neon 179*c0909341SAndroid Build Coastguard WorkerL(v2_7): 180*c0909341SAndroid Build Coastguard Worker bl wiener_filter7_v_16bpc_neon 181*c0909341SAndroid Build Coastguard Worker b L(v1_7) 182*c0909341SAndroid Build Coastguard Workerendfunc 183*c0909341SAndroid Build Coastguard Worker 184*c0909341SAndroid Build Coastguard Worker 185*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_h_16bpc_neon 186*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #-32]! 187*c0909341SAndroid Build Coastguard Worker str x14, [sp, #16] 188*c0909341SAndroid Build Coastguard Worker 189*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 190*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 191*c0909341SAndroid Build Coastguard Worker b.eq 1f 192*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 193*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 194*c0909341SAndroid Build Coastguard Worker // left == NULL 195*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 196*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 197*c0909341SAndroid Build Coastguard Worker b 2f 198*c0909341SAndroid Build Coastguard Worker 199*c0909341SAndroid Build Coastguard Worker0: 200*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 201*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 202*c0909341SAndroid Build Coastguard Worker ld1 {v4.d}[1], [x2], #8 203*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded earlier, 204*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 205*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 206*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #10 207*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #10 208*c0909341SAndroid Build Coastguard Worker b 2f 209*c0909341SAndroid Build Coastguard Worker 210*c0909341SAndroid Build Coastguard Worker1: 211*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 212*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v4 with the leftmost pixel 213*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 3x the first pixel at the front. 214*c0909341SAndroid Build Coastguard Worker dup v4.8h, v2.h[0] 215*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded before, 216*c0909341SAndroid Build Coastguard Worker // which we shifted out. 217*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 218*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #10 219*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #10 220*c0909341SAndroid Build Coastguard Worker 221*c0909341SAndroid Build Coastguard Worker2: 222*c0909341SAndroid Build Coastguard Worker ld1 {v4.8h}, [x3], #16 223*c0909341SAndroid Build Coastguard Worker 224*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 225*c0909341SAndroid Build Coastguard Worker b.ne 4f 226*c0909341SAndroid Build Coastguard Worker 227*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 228*c0909341SAndroid Build Coastguard Worker 229*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 230*c0909341SAndroid Build Coastguard Worker cmp w4, #19 231*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 19, all used input pixels are valid 232*c0909341SAndroid Build Coastguard Worker 233*c0909341SAndroid Build Coastguard Worker // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, 234*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 235*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 236*c0909341SAndroid Build Coastguard Worker 237*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie 238*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. 239*c0909341SAndroid Build Coastguard Worker sub w17, w4, #22 240*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the 241*c0909341SAndroid Build Coastguard Worker // buffer pointer. 242*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -6 243*c0909341SAndroid Build Coastguard Worker ldr h26, [x3, w17, sxtw #1] 244*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 245*c0909341SAndroid Build Coastguard Worker dup v26.8h, v26.h[0] 246*c0909341SAndroid Build Coastguard Worker ld1 {v23.16b, v24.16b, v25.16b}, [x6] 247*c0909341SAndroid Build Coastguard Worker 248*c0909341SAndroid Build Coastguard Worker bit v2.16b, v26.16b, v23.16b 249*c0909341SAndroid Build Coastguard Worker bit v3.16b, v26.16b, v24.16b 250*c0909341SAndroid Build Coastguard Worker bit v4.16b, v26.16b, v25.16b 251*c0909341SAndroid Build Coastguard Worker 252*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 253*c0909341SAndroid Build Coastguard Worker // Interleaving the mul/mla chains actually hurts performance 254*c0909341SAndroid Build Coastguard Worker // significantly on Cortex A53, thus keeping mul/mla tightly 255*c0909341SAndroid Build Coastguard Worker // chained like this. 256*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 257*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 258*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 259*c0909341SAndroid Build Coastguard Worker ext v20.16b, v2.16b, v3.16b, #10 260*c0909341SAndroid Build Coastguard Worker ext v21.16b, v2.16b, v3.16b, #12 261*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 262*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v17.8h 263*c0909341SAndroid Build Coastguard Worker add v20.8h, v20.8h, v16.8h 264*c0909341SAndroid Build Coastguard Worker add v21.8h, v21.8h, v2.8h 265*c0909341SAndroid Build Coastguard Worker smull v6.4s, v18.4h, v0.h[3] 266*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v19.4h, v0.h[2] 267*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v20.4h, v0.h[1] 268*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v21.4h, v0.h[0] 269*c0909341SAndroid Build Coastguard Worker smull2 v7.4s, v18.8h, v0.h[3] 270*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v19.8h, v0.h[2] 271*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v20.8h, v0.h[1] 272*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v21.8h, v0.h[0] 273*c0909341SAndroid Build Coastguard Worker 274*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 275*c0909341SAndroid Build Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 276*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 277*c0909341SAndroid Build Coastguard Worker ext v20.16b, v3.16b, v4.16b, #10 278*c0909341SAndroid Build Coastguard Worker ext v21.16b, v3.16b, v4.16b, #12 279*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 280*c0909341SAndroid Build Coastguard Worker 281*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v17.8h 282*c0909341SAndroid Build Coastguard Worker add v20.8h, v20.8h, v16.8h 283*c0909341SAndroid Build Coastguard Worker add v21.8h, v21.8h, v3.8h 284*c0909341SAndroid Build Coastguard Worker smull v16.4s, v18.4h, v0.h[3] 285*c0909341SAndroid Build Coastguard Worker smlal v16.4s, v19.4h, v0.h[2] 286*c0909341SAndroid Build Coastguard Worker smlal v16.4s, v20.4h, v0.h[1] 287*c0909341SAndroid Build Coastguard Worker smlal v16.4s, v21.4h, v0.h[0] 288*c0909341SAndroid Build Coastguard Worker smull2 v17.4s, v18.8h, v0.h[3] 289*c0909341SAndroid Build Coastguard Worker smlal2 v17.4s, v19.8h, v0.h[2] 290*c0909341SAndroid Build Coastguard Worker smlal2 v17.4s, v20.8h, v0.h[1] 291*c0909341SAndroid Build Coastguard Worker smlal2 v17.4s, v21.8h, v0.h[0] 292*c0909341SAndroid Build Coastguard Worker 293*c0909341SAndroid Build Coastguard Worker mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 294*c0909341SAndroid Build Coastguard Worker add v6.4s, v6.4s, v30.4s 295*c0909341SAndroid Build Coastguard Worker add v7.4s, v7.4s, v30.4s 296*c0909341SAndroid Build Coastguard Worker add v16.4s, v16.4s, v30.4s 297*c0909341SAndroid Build Coastguard Worker add v17.4s, v17.4s, v30.4s 298*c0909341SAndroid Build Coastguard Worker srshl v6.4s, v6.4s, v29.4s 299*c0909341SAndroid Build Coastguard Worker srshl v7.4s, v7.4s, v29.4s 300*c0909341SAndroid Build Coastguard Worker srshl v16.4s, v16.4s, v29.4s 301*c0909341SAndroid Build Coastguard Worker srshl v17.4s, v17.4s, v29.4s 302*c0909341SAndroid Build Coastguard Worker sqxtun v6.4h, v6.4s 303*c0909341SAndroid Build Coastguard Worker sqxtun2 v6.8h, v7.4s 304*c0909341SAndroid Build Coastguard Worker sqxtun v7.4h, v16.4s 305*c0909341SAndroid Build Coastguard Worker sqxtun2 v7.8h, v17.4s 306*c0909341SAndroid Build Coastguard Worker umin v6.8h, v6.8h, v24.8h 307*c0909341SAndroid Build Coastguard Worker umin v7.8h, v7.8h, v24.8h 308*c0909341SAndroid Build Coastguard Worker sub v6.8h, v6.8h, v31.8h 309*c0909341SAndroid Build Coastguard Worker sub v7.8h, v7.8h, v31.8h 310*c0909341SAndroid Build Coastguard Worker 311*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 312*c0909341SAndroid Build Coastguard Worker 313*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x14], #32 314*c0909341SAndroid Build Coastguard Worker 315*c0909341SAndroid Build Coastguard Worker b.le 0f 316*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 317*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 318*c0909341SAndroid Build Coastguard Worker ld1 {v3.8h, v4.8h}, [x3], #32 319*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep filtering. 320*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 321*c0909341SAndroid Build Coastguard Worker 322*c0909341SAndroid Build Coastguard Worker0: 323*c0909341SAndroid Build Coastguard Worker ldr x14, [sp, #16] 324*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp], #32 325*c0909341SAndroid Build Coastguard Worker ret 326*c0909341SAndroid Build Coastguard Workerendfunc 327*c0909341SAndroid Build Coastguard Worker 328*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_v_16bpc_neon 329*c0909341SAndroid Build Coastguard Worker // Backing up/restoring registers shifted, so that x9 gets the value 330*c0909341SAndroid Build Coastguard Worker // of x10, etc, afterwards. 331*c0909341SAndroid Build Coastguard Worker stp x10, x11, [sp, #-64]! 332*c0909341SAndroid Build Coastguard Worker stp x12, x13, [sp, #16] 333*c0909341SAndroid Build Coastguard Worker stp x14, x14, [sp, #32] 334*c0909341SAndroid Build Coastguard Worker stp x0, x4, [sp, #48] 335*c0909341SAndroid Build Coastguard Worker1: 336*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x9], #32 337*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x10], #32 338*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x11], #32 339*c0909341SAndroid Build Coastguard Worker ld1 {v22.8h, v23.8h}, [x12], #32 340*c0909341SAndroid Build Coastguard Worker ld1 {v24.8h, v25.8h}, [x13], #32 341*c0909341SAndroid Build Coastguard Worker ld1 {v6.8h, v7.8h}, [x14], #32 342*c0909341SAndroid Build Coastguard Worker 343*c0909341SAndroid Build Coastguard Worker smull v2.4s, v16.4h, v0.h[4] 344*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v18.4h, v0.h[5] 345*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v20.4h, v0.h[6] 346*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v22.4h, v0.h[7] 347*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v24.4h, v0.h[6] 348*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v6.4h, v0.h[5] 349*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v6.4h, v0.h[4] 350*c0909341SAndroid Build Coastguard Worker smull2 v3.4s, v16.8h, v0.h[4] 351*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v18.8h, v0.h[5] 352*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v20.8h, v0.h[6] 353*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v22.8h, v0.h[7] 354*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v24.8h, v0.h[6] 355*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v6.8h, v0.h[5] 356*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v6.8h, v0.h[4] 357*c0909341SAndroid Build Coastguard Worker smull v4.4s, v17.4h, v0.h[4] 358*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v19.4h, v0.h[5] 359*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v21.4h, v0.h[6] 360*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v23.4h, v0.h[7] 361*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v25.4h, v0.h[6] 362*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v7.4h, v0.h[5] 363*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v7.4h, v0.h[4] 364*c0909341SAndroid Build Coastguard Worker smull2 v5.4s, v17.8h, v0.h[4] 365*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v19.8h, v0.h[5] 366*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v21.8h, v0.h[6] 367*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v23.8h, v0.h[7] 368*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v25.8h, v0.h[6] 369*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v7.8h, v0.h[5] 370*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v7.8h, v0.h[4] 371*c0909341SAndroid Build Coastguard Worker srshl v2.4s, v2.4s, v27.4s // -round_bits_v 372*c0909341SAndroid Build Coastguard Worker srshl v3.4s, v3.4s, v27.4s 373*c0909341SAndroid Build Coastguard Worker srshl v4.4s, v4.4s, v27.4s 374*c0909341SAndroid Build Coastguard Worker srshl v5.4s, v5.4s, v27.4s 375*c0909341SAndroid Build Coastguard Worker sqxtun v2.4h, v2.4s 376*c0909341SAndroid Build Coastguard Worker sqxtun2 v2.8h, v3.4s 377*c0909341SAndroid Build Coastguard Worker sqxtun v3.4h, v4.4s 378*c0909341SAndroid Build Coastguard Worker sqxtun2 v3.8h, v5.4s 379*c0909341SAndroid Build Coastguard Worker umin v2.8h, v2.8h, v28.8h // bitdepth_max 380*c0909341SAndroid Build Coastguard Worker umin v3.8h, v3.8h, v28.8h 381*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 382*c0909341SAndroid Build Coastguard Worker st1 {v2.8h, v3.8h}, [x0], #32 383*c0909341SAndroid Build Coastguard Worker b.gt 1b 384*c0909341SAndroid Build Coastguard Worker 385*c0909341SAndroid Build Coastguard Worker ldp x0, x4, [sp, #48] 386*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #32] 387*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp, #16] 388*c0909341SAndroid Build Coastguard Worker ldp x9, x10, [sp], #64 389*c0909341SAndroid Build Coastguard Worker 390*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 391*c0909341SAndroid Build Coastguard Worker ret 392*c0909341SAndroid Build Coastguard Workerendfunc 393*c0909341SAndroid Build Coastguard Worker 394*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_hv_16bpc_neon 395*c0909341SAndroid Build Coastguard Worker // Backing up/restoring registers shifted, so that x9 gets the value 396*c0909341SAndroid Build Coastguard Worker // of x10, etc, and x15==x9, afterwards. 397*c0909341SAndroid Build Coastguard Worker stp x10, x11, [sp, #-80]! 398*c0909341SAndroid Build Coastguard Worker stp x12, x13, [sp, #16] 399*c0909341SAndroid Build Coastguard Worker stp x14, x15, [sp, #32] 400*c0909341SAndroid Build Coastguard Worker stp x10, x0, [sp, #48] 401*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #64] 402*c0909341SAndroid Build Coastguard Worker 403*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 404*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 405*c0909341SAndroid Build Coastguard Worker b.eq 1f 406*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 407*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 408*c0909341SAndroid Build Coastguard Worker // left == NULL 409*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 410*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 411*c0909341SAndroid Build Coastguard Worker b 2f 412*c0909341SAndroid Build Coastguard Worker 413*c0909341SAndroid Build Coastguard Worker0: 414*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 415*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 416*c0909341SAndroid Build Coastguard Worker ld1 {v4.d}[1], [x2], #8 417*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded earlier, 418*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 419*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 420*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #10 421*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #10 422*c0909341SAndroid Build Coastguard Worker b 2f 423*c0909341SAndroid Build Coastguard Worker1: 424*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 425*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v4 with the leftmost pixel 426*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 3x the first pixel at the front. 427*c0909341SAndroid Build Coastguard Worker dup v4.8h, v2.h[0] 428*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded before, 429*c0909341SAndroid Build Coastguard Worker // which we shifted out. 430*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 431*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #10 432*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #10 433*c0909341SAndroid Build Coastguard Worker 434*c0909341SAndroid Build Coastguard Worker2: 435*c0909341SAndroid Build Coastguard Worker ld1 {v4.8h}, [x3], #16 436*c0909341SAndroid Build Coastguard Worker 437*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 438*c0909341SAndroid Build Coastguard Worker b.ne 4f 439*c0909341SAndroid Build Coastguard Worker 440*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 441*c0909341SAndroid Build Coastguard Worker 442*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 443*c0909341SAndroid Build Coastguard Worker cmp w4, #19 444*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 19, all used input pixels are valid 445*c0909341SAndroid Build Coastguard Worker 446*c0909341SAndroid Build Coastguard Worker // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, 447*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 448*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 449*c0909341SAndroid Build Coastguard Worker 450*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie 451*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. 452*c0909341SAndroid Build Coastguard Worker sub w17, w4, #22 453*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the 454*c0909341SAndroid Build Coastguard Worker // buffer pointer. 455*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -6 456*c0909341SAndroid Build Coastguard Worker ldr h26, [x3, w17, sxtw #1] 457*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 458*c0909341SAndroid Build Coastguard Worker dup v26.8h, v26.h[0] 459*c0909341SAndroid Build Coastguard Worker ld1 {v23.16b, v24.16b, v25.16b}, [x6] 460*c0909341SAndroid Build Coastguard Worker 461*c0909341SAndroid Build Coastguard Worker bit v2.16b, v26.16b, v23.16b 462*c0909341SAndroid Build Coastguard Worker bit v3.16b, v26.16b, v24.16b 463*c0909341SAndroid Build Coastguard Worker bit v4.16b, v26.16b, v25.16b 464*c0909341SAndroid Build Coastguard Worker 465*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 466*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 467*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 468*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 469*c0909341SAndroid Build Coastguard Worker ext v20.16b, v2.16b, v3.16b, #10 470*c0909341SAndroid Build Coastguard Worker ext v21.16b, v2.16b, v3.16b, #12 471*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 472*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v17.8h 473*c0909341SAndroid Build Coastguard Worker add v20.8h, v20.8h, v16.8h 474*c0909341SAndroid Build Coastguard Worker add v21.8h, v21.8h, v2.8h 475*c0909341SAndroid Build Coastguard Worker smull v6.4s, v18.4h, v0.h[3] 476*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v19.4h, v0.h[2] 477*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v20.4h, v0.h[1] 478*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v21.4h, v0.h[0] 479*c0909341SAndroid Build Coastguard Worker smull2 v7.4s, v18.8h, v0.h[3] 480*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v19.8h, v0.h[2] 481*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v20.8h, v0.h[1] 482*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v21.8h, v0.h[0] 483*c0909341SAndroid Build Coastguard Worker 484*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 485*c0909341SAndroid Build Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 486*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 487*c0909341SAndroid Build Coastguard Worker ext v20.16b, v3.16b, v4.16b, #10 488*c0909341SAndroid Build Coastguard Worker ext v21.16b, v3.16b, v4.16b, #12 489*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 490*c0909341SAndroid Build Coastguard Worker 491*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v17.8h 492*c0909341SAndroid Build Coastguard Worker add v20.8h, v20.8h, v16.8h 493*c0909341SAndroid Build Coastguard Worker add v21.8h, v21.8h, v3.8h 494*c0909341SAndroid Build Coastguard Worker smull v24.4s, v18.4h, v0.h[3] 495*c0909341SAndroid Build Coastguard Worker smlal v24.4s, v19.4h, v0.h[2] 496*c0909341SAndroid Build Coastguard Worker smlal v24.4s, v20.4h, v0.h[1] 497*c0909341SAndroid Build Coastguard Worker smlal v24.4s, v21.4h, v0.h[0] 498*c0909341SAndroid Build Coastguard Worker smull2 v25.4s, v18.8h, v0.h[3] 499*c0909341SAndroid Build Coastguard Worker smlal2 v25.4s, v19.8h, v0.h[2] 500*c0909341SAndroid Build Coastguard Worker smlal2 v25.4s, v20.8h, v0.h[1] 501*c0909341SAndroid Build Coastguard Worker smlal2 v25.4s, v21.8h, v0.h[0] 502*c0909341SAndroid Build Coastguard Worker 503*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x9], #32 504*c0909341SAndroid Build Coastguard Worker 505*c0909341SAndroid Build Coastguard Worker mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 506*c0909341SAndroid Build Coastguard Worker add v6.4s, v6.4s, v30.4s 507*c0909341SAndroid Build Coastguard Worker add v7.4s, v7.4s, v30.4s 508*c0909341SAndroid Build Coastguard Worker add v24.4s, v24.4s, v30.4s 509*c0909341SAndroid Build Coastguard Worker add v25.4s, v25.4s, v30.4s 510*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x10], #32 511*c0909341SAndroid Build Coastguard Worker srshl v6.4s, v6.4s, v29.4s 512*c0909341SAndroid Build Coastguard Worker srshl v7.4s, v7.4s, v29.4s 513*c0909341SAndroid Build Coastguard Worker srshl v24.4s, v24.4s, v29.4s 514*c0909341SAndroid Build Coastguard Worker srshl v25.4s, v25.4s, v29.4s 515*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x11], #32 516*c0909341SAndroid Build Coastguard Worker sqxtun v6.4h, v6.4s 517*c0909341SAndroid Build Coastguard Worker sqxtun2 v6.8h, v7.4s 518*c0909341SAndroid Build Coastguard Worker sqxtun v7.4h, v24.4s 519*c0909341SAndroid Build Coastguard Worker sqxtun2 v7.8h, v25.4s 520*c0909341SAndroid Build Coastguard Worker ld1 {v22.8h, v23.8h}, [x12], #32 521*c0909341SAndroid Build Coastguard Worker umin v6.8h, v6.8h, v26.8h 522*c0909341SAndroid Build Coastguard Worker umin v7.8h, v7.8h, v26.8h 523*c0909341SAndroid Build Coastguard Worker ld1 {v24.8h, v25.8h}, [x13], #32 524*c0909341SAndroid Build Coastguard Worker sub v6.8h, v6.8h, v31.8h 525*c0909341SAndroid Build Coastguard Worker sub v7.8h, v7.8h, v31.8h 526*c0909341SAndroid Build Coastguard Worker 527*c0909341SAndroid Build Coastguard Worker ld1 {v8.8h, v9.8h}, [x14], #32 528*c0909341SAndroid Build Coastguard Worker 529*c0909341SAndroid Build Coastguard Worker smull v1.4s, v16.4h, v0.h[4] 530*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v18.4h, v0.h[5] 531*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v20.4h, v0.h[6] 532*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v22.4h, v0.h[7] 533*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v24.4h, v0.h[6] 534*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v8.4h, v0.h[5] 535*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v6.4h, v0.h[4] 536*c0909341SAndroid Build Coastguard Worker smull2 v5.4s, v16.8h, v0.h[4] 537*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v18.8h, v0.h[5] 538*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v20.8h, v0.h[6] 539*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v22.8h, v0.h[7] 540*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v24.8h, v0.h[6] 541*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v8.8h, v0.h[5] 542*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v6.8h, v0.h[4] 543*c0909341SAndroid Build Coastguard Worker smull v26.4s, v17.4h, v0.h[4] 544*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v19.4h, v0.h[5] 545*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v21.4h, v0.h[6] 546*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v23.4h, v0.h[7] 547*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v25.4h, v0.h[6] 548*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v9.4h, v0.h[5] 549*c0909341SAndroid Build Coastguard Worker smlal v26.4s, v7.4h, v0.h[4] 550*c0909341SAndroid Build Coastguard Worker smull2 v16.4s, v17.8h, v0.h[4] 551*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v19.8h, v0.h[5] 552*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v21.8h, v0.h[6] 553*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v23.8h, v0.h[7] 554*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v25.8h, v0.h[6] 555*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v9.8h, v0.h[5] 556*c0909341SAndroid Build Coastguard Worker smlal2 v16.4s, v7.8h, v0.h[4] 557*c0909341SAndroid Build Coastguard Worker srshl v1.4s, v1.4s, v27.4s // -round_bits_v 558*c0909341SAndroid Build Coastguard Worker srshl v5.4s, v5.4s, v27.4s 559*c0909341SAndroid Build Coastguard Worker srshl v26.4s, v26.4s, v27.4s 560*c0909341SAndroid Build Coastguard Worker srshl v16.4s, v16.4s, v27.4s 561*c0909341SAndroid Build Coastguard Worker sqxtun v18.4h, v1.4s 562*c0909341SAndroid Build Coastguard Worker sqxtun2 v18.8h, v5.4s 563*c0909341SAndroid Build Coastguard Worker sqxtun v19.4h, v26.4s 564*c0909341SAndroid Build Coastguard Worker sqxtun2 v19.8h, v16.4s 565*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x15], #32 566*c0909341SAndroid Build Coastguard Worker umin v18.8h, v18.8h, v28.8h // bitdepth_max 567*c0909341SAndroid Build Coastguard Worker umin v19.8h, v19.8h, v28.8h 568*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 569*c0909341SAndroid Build Coastguard Worker 570*c0909341SAndroid Build Coastguard Worker st1 {v18.8h, v19.8h}, [x0], #32 571*c0909341SAndroid Build Coastguard Worker 572*c0909341SAndroid Build Coastguard Worker b.le 0f 573*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 574*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 575*c0909341SAndroid Build Coastguard Worker ld1 {v3.8h, v4.8h}, [x3], #32 576*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep filtering. 577*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 578*c0909341SAndroid Build Coastguard Worker 579*c0909341SAndroid Build Coastguard Worker0: 580*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp, #64] 581*c0909341SAndroid Build Coastguard Worker ldp x15, x0, [sp, #48] 582*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #32] 583*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp, #16] 584*c0909341SAndroid Build Coastguard Worker ldp x9, x10, [sp], #80 585*c0909341SAndroid Build Coastguard Worker 586*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 587*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 588*c0909341SAndroid Build Coastguard Worker 589*c0909341SAndroid Build Coastguard Worker ret 590*c0909341SAndroid Build Coastguard Workerendfunc 591*c0909341SAndroid Build Coastguard Worker 592*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, 593*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], const pixel *lpf, 594*c0909341SAndroid Build Coastguard Worker// const int w, int h, 595*c0909341SAndroid Build Coastguard Worker// const int16_t filter[2][8], 596*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges, 597*c0909341SAndroid Build Coastguard Worker// const int bitdepth_max); 598*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_16bpc_neon, export=1 599*c0909341SAndroid Build Coastguard Worker ldr w8, [sp] 600*c0909341SAndroid Build Coastguard Worker AARCH64_SIGN_LINK_REGISTER 601*c0909341SAndroid Build Coastguard Worker stp x29, x30, [sp, #-32]! 602*c0909341SAndroid Build Coastguard Worker stp d8, d9, [sp, #16] 603*c0909341SAndroid Build Coastguard Worker mov x29, sp 604*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x6] 605*c0909341SAndroid Build Coastguard Worker tst w7, #4 // LR_HAVE_TOP 606*c0909341SAndroid Build Coastguard Worker sub_sp 384*2*4 607*c0909341SAndroid Build Coastguard Worker 608*c0909341SAndroid Build Coastguard Worker dup v28.8h, w8 // bitdepth_max 609*c0909341SAndroid Build Coastguard Worker clz w8, w8 610*c0909341SAndroid Build Coastguard Worker movi v30.4s, #1 611*c0909341SAndroid Build Coastguard Worker sub w10, w8, #38 // -(bitdepth + 6) 612*c0909341SAndroid Build Coastguard Worker sub w11, w8, #11 // round_bits_v 613*c0909341SAndroid Build Coastguard Worker sub w8, w8, #25 // -round_bits_h 614*c0909341SAndroid Build Coastguard Worker neg w10, w10 // bitdepth + 6 615*c0909341SAndroid Build Coastguard Worker neg w11, w11 // -round_bits_v 616*c0909341SAndroid Build Coastguard Worker dup v2.4s, w10 617*c0909341SAndroid Build Coastguard Worker dup v29.4s, w8 // -round_bits_h 618*c0909341SAndroid Build Coastguard Worker dup v27.4s, w11 // -round_bits_v 619*c0909341SAndroid Build Coastguard Worker movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 620*c0909341SAndroid Build Coastguard Worker ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) 621*c0909341SAndroid Build Coastguard Worker 622*c0909341SAndroid Build Coastguard Worker zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 623*c0909341SAndroid Build Coastguard Worker 624*c0909341SAndroid Build Coastguard Worker // x11 - t4 625*c0909341SAndroid Build Coastguard Worker // x12 - t3 626*c0909341SAndroid Build Coastguard Worker // x13 - t2 627*c0909341SAndroid Build Coastguard Worker // x14 - t1 628*c0909341SAndroid Build Coastguard Worker // x15 - t0 629*c0909341SAndroid Build Coastguard Worker mov x14, sp // t1 630*c0909341SAndroid Build Coastguard Worker b.eq L(no_top_5) 631*c0909341SAndroid Build Coastguard Worker 632*c0909341SAndroid Build Coastguard Worker mov x16, x2 // backup left 633*c0909341SAndroid Build Coastguard Worker mov x2, #0 634*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 635*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride 636*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 637*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 638*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 639*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 640*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride*5 641*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 642*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 643*c0909341SAndroid Build Coastguard Worker mov x2, x16 // left 644*c0909341SAndroid Build Coastguard Worker mov x16, x3 // backup lpf 645*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf = p 646*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 647*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 648*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 649*c0909341SAndroid Build Coastguard Worker b.eq L(v1_5) 650*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 651*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 652*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 653*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 654*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 655*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 656*c0909341SAndroid Build Coastguard Worker 657*c0909341SAndroid Build Coastguard WorkerL(main_5): 658*c0909341SAndroid Build Coastguard Worker mov x15, x11 // t0 = t4 659*c0909341SAndroid Build Coastguard WorkerL(main_loop_5): 660*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_16bpc_neon 661*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 662*c0909341SAndroid Build Coastguard Worker b.ne L(main_loop_5) 663*c0909341SAndroid Build Coastguard Worker tst w7, #8 // LR_HAVE_BOTTOM 664*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 665*c0909341SAndroid Build Coastguard Worker 666*c0909341SAndroid Build Coastguard Worker mov x3, x16 // restore lpf 667*c0909341SAndroid Build Coastguard Worker mov x2, #0 // left = NULL 668*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_16bpc_neon 669*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_16bpc_neon 670*c0909341SAndroid Build Coastguard WorkerL(end_5): 671*c0909341SAndroid Build Coastguard Worker 672*c0909341SAndroid Build Coastguard Worker mov sp, x29 673*c0909341SAndroid Build Coastguard Worker ldp d8, d9, [sp, #16] 674*c0909341SAndroid Build Coastguard Worker ldp x29, x30, [sp], #32 675*c0909341SAndroid Build Coastguard Worker AARCH64_VALIDATE_LINK_REGISTER 676*c0909341SAndroid Build Coastguard Worker ret 677*c0909341SAndroid Build Coastguard Worker 678*c0909341SAndroid Build Coastguard WorkerL(no_top_5): 679*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 680*c0909341SAndroid Build Coastguard Worker add x16, x3, x1, lsl #1 // lpf += stride*6, backup 681*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf = p 682*c0909341SAndroid Build Coastguard Worker 683*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 684*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 685*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 686*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 687*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 688*c0909341SAndroid Build Coastguard Worker b.eq L(v1_5) 689*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 690*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 691*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_16bpc_neon 692*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 693*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 694*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 695*c0909341SAndroid Build Coastguard Worker add x15, x14, #384*2 // t0 = t1 + 384*2 696*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_16bpc_neon 697*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 698*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 699*c0909341SAndroid Build Coastguard Worker add x15, x15, #384*2*3 // t0 += 384*2*3 700*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_16bpc_neon 701*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 702*c0909341SAndroid Build Coastguard Worker b.ne L(main_5) 703*c0909341SAndroid Build Coastguard WorkerL(v2_5): 704*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_v_16bpc_neon 705*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 706*c0909341SAndroid Build Coastguard Worker mov x11, x12 707*c0909341SAndroid Build Coastguard Worker mov x12, x13 708*c0909341SAndroid Build Coastguard Worker mov x13, x14 709*c0909341SAndroid Build Coastguard WorkerL(v1_5): 710*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_v_16bpc_neon 711*c0909341SAndroid Build Coastguard Worker b L(end_5) 712*c0909341SAndroid Build Coastguard Workerendfunc 713*c0909341SAndroid Build Coastguard Worker 714*c0909341SAndroid Build Coastguard Worker 715*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_h_16bpc_neon 716*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #-32]! 717*c0909341SAndroid Build Coastguard Worker str x14, [sp, #16] 718*c0909341SAndroid Build Coastguard Worker 719*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 720*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 721*c0909341SAndroid Build Coastguard Worker b.eq 1f 722*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 723*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 724*c0909341SAndroid Build Coastguard Worker // left == NULL 725*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 726*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 727*c0909341SAndroid Build Coastguard Worker b 2f 728*c0909341SAndroid Build Coastguard Worker 729*c0909341SAndroid Build Coastguard Worker0: 730*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 731*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 732*c0909341SAndroid Build Coastguard Worker ld1 {v4.d}[1], [x2], #8 733*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded earlier, 734*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 735*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 736*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #12 737*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #12 738*c0909341SAndroid Build Coastguard Worker b 2f 739*c0909341SAndroid Build Coastguard Worker 740*c0909341SAndroid Build Coastguard Worker1: 741*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 742*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost pixel 743*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 3x the first pixel at the front. 744*c0909341SAndroid Build Coastguard Worker dup v4.8h, v2.h[0] 745*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded before, 746*c0909341SAndroid Build Coastguard Worker // which we shifted out. 747*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 748*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #12 749*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #12 750*c0909341SAndroid Build Coastguard Worker 751*c0909341SAndroid Build Coastguard Worker2: 752*c0909341SAndroid Build Coastguard Worker ld1 {v4.8h}, [x3], #16 753*c0909341SAndroid Build Coastguard Worker 754*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 755*c0909341SAndroid Build Coastguard Worker b.ne 4f 756*c0909341SAndroid Build Coastguard Worker 757*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 758*c0909341SAndroid Build Coastguard Worker 759*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 760*c0909341SAndroid Build Coastguard Worker cmp w4, #18 761*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 18, all used input pixels are valid 762*c0909341SAndroid Build Coastguard Worker 763*c0909341SAndroid Build Coastguard Worker // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, 764*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 765*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 766*c0909341SAndroid Build Coastguard Worker 767*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie 768*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 769*c0909341SAndroid Build Coastguard Worker sub w17, w4, #23 770*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the 771*c0909341SAndroid Build Coastguard Worker // buffer pointer. 772*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -4 773*c0909341SAndroid Build Coastguard Worker ldr h26, [x3, w17, sxtw #1] 774*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 775*c0909341SAndroid Build Coastguard Worker dup v26.8h, v26.h[0] 776*c0909341SAndroid Build Coastguard Worker ld1 {v23.16b, v24.16b, v25.16b}, [x6] 777*c0909341SAndroid Build Coastguard Worker 778*c0909341SAndroid Build Coastguard Worker bit v2.16b, v26.16b, v23.16b 779*c0909341SAndroid Build Coastguard Worker bit v3.16b, v26.16b, v24.16b 780*c0909341SAndroid Build Coastguard Worker bit v4.16b, v26.16b, v25.16b 781*c0909341SAndroid Build Coastguard Worker 782*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 783*c0909341SAndroid Build Coastguard Worker // Interleaving the mul/mla chains actually hurts performance 784*c0909341SAndroid Build Coastguard Worker // significantly on Cortex A53, thus keeping mul/mla tightly 785*c0909341SAndroid Build Coastguard Worker // chained like this. 786*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 787*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 788*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 789*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 790*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 791*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v2.8h 792*c0909341SAndroid Build Coastguard Worker smull v6.4s, v17.4h, v0.h[3] 793*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v18.4h, v0.h[2] 794*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v19.4h, v0.h[1] 795*c0909341SAndroid Build Coastguard Worker smull2 v7.4s, v17.8h, v0.h[3] 796*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v18.8h, v0.h[2] 797*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v19.8h, v0.h[1] 798*c0909341SAndroid Build Coastguard Worker 799*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 800*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 801*c0909341SAndroid Build Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 802*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 803*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 804*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v3.8h 805*c0909341SAndroid Build Coastguard Worker smull v16.4s, v17.4h, v0.h[3] 806*c0909341SAndroid Build Coastguard Worker smlal v16.4s, v18.4h, v0.h[2] 807*c0909341SAndroid Build Coastguard Worker smlal v16.4s, v19.4h, v0.h[1] 808*c0909341SAndroid Build Coastguard Worker smull2 v17.4s, v17.8h, v0.h[3] 809*c0909341SAndroid Build Coastguard Worker smlal2 v17.4s, v18.8h, v0.h[2] 810*c0909341SAndroid Build Coastguard Worker smlal2 v17.4s, v19.8h, v0.h[1] 811*c0909341SAndroid Build Coastguard Worker 812*c0909341SAndroid Build Coastguard Worker mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 813*c0909341SAndroid Build Coastguard Worker add v6.4s, v6.4s, v30.4s 814*c0909341SAndroid Build Coastguard Worker add v7.4s, v7.4s, v30.4s 815*c0909341SAndroid Build Coastguard Worker add v16.4s, v16.4s, v30.4s 816*c0909341SAndroid Build Coastguard Worker add v17.4s, v17.4s, v30.4s 817*c0909341SAndroid Build Coastguard Worker srshl v6.4s, v6.4s, v29.4s 818*c0909341SAndroid Build Coastguard Worker srshl v7.4s, v7.4s, v29.4s 819*c0909341SAndroid Build Coastguard Worker srshl v16.4s, v16.4s, v29.4s 820*c0909341SAndroid Build Coastguard Worker srshl v17.4s, v17.4s, v29.4s 821*c0909341SAndroid Build Coastguard Worker sqxtun v6.4h, v6.4s 822*c0909341SAndroid Build Coastguard Worker sqxtun2 v6.8h, v7.4s 823*c0909341SAndroid Build Coastguard Worker sqxtun v7.4h, v16.4s 824*c0909341SAndroid Build Coastguard Worker sqxtun2 v7.8h, v17.4s 825*c0909341SAndroid Build Coastguard Worker umin v6.8h, v6.8h, v24.8h 826*c0909341SAndroid Build Coastguard Worker umin v7.8h, v7.8h, v24.8h 827*c0909341SAndroid Build Coastguard Worker sub v6.8h, v6.8h, v31.8h 828*c0909341SAndroid Build Coastguard Worker sub v7.8h, v7.8h, v31.8h 829*c0909341SAndroid Build Coastguard Worker 830*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 831*c0909341SAndroid Build Coastguard Worker 832*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x14], #32 833*c0909341SAndroid Build Coastguard Worker 834*c0909341SAndroid Build Coastguard Worker b.le 0f 835*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 836*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 837*c0909341SAndroid Build Coastguard Worker ld1 {v3.8h, v4.8h}, [x3], #32 838*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep filtering. 839*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 840*c0909341SAndroid Build Coastguard Worker 841*c0909341SAndroid Build Coastguard Worker0: 842*c0909341SAndroid Build Coastguard Worker ldr x14, [sp, #16] 843*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp], #32 844*c0909341SAndroid Build Coastguard Worker ret 845*c0909341SAndroid Build Coastguard Workerendfunc 846*c0909341SAndroid Build Coastguard Worker 847*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_v_16bpc_neon 848*c0909341SAndroid Build Coastguard Worker stp x11, x12, [sp, #-48]! 849*c0909341SAndroid Build Coastguard Worker stp x13, x14, [sp, #16] 850*c0909341SAndroid Build Coastguard Worker stp x0, x4, [sp, #32] 851*c0909341SAndroid Build Coastguard Worker1: 852*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x11], #32 853*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x12], #32 854*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x13], #32 855*c0909341SAndroid Build Coastguard Worker ld1 {v22.8h, v23.8h}, [x14], #32 856*c0909341SAndroid Build Coastguard Worker 857*c0909341SAndroid Build Coastguard Worker smull v2.4s, v16.4h, v0.h[5] 858*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v18.4h, v0.h[6] 859*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v20.4h, v0.h[7] 860*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v22.4h, v0.h[6] 861*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v22.4h, v0.h[5] 862*c0909341SAndroid Build Coastguard Worker smull2 v3.4s, v16.8h, v0.h[5] 863*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v18.8h, v0.h[6] 864*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v20.8h, v0.h[7] 865*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v22.8h, v0.h[6] 866*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v22.8h, v0.h[5] 867*c0909341SAndroid Build Coastguard Worker smull v4.4s, v17.4h, v0.h[5] 868*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v19.4h, v0.h[6] 869*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v21.4h, v0.h[7] 870*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v23.4h, v0.h[6] 871*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v23.4h, v0.h[5] 872*c0909341SAndroid Build Coastguard Worker smull2 v5.4s, v17.8h, v0.h[5] 873*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v19.8h, v0.h[6] 874*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v21.8h, v0.h[7] 875*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v23.8h, v0.h[6] 876*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v23.8h, v0.h[5] 877*c0909341SAndroid Build Coastguard Worker srshl v2.4s, v2.4s, v27.4s // -round_bits_v 878*c0909341SAndroid Build Coastguard Worker srshl v3.4s, v3.4s, v27.4s 879*c0909341SAndroid Build Coastguard Worker srshl v4.4s, v4.4s, v27.4s 880*c0909341SAndroid Build Coastguard Worker srshl v5.4s, v5.4s, v27.4s 881*c0909341SAndroid Build Coastguard Worker sqxtun v2.4h, v2.4s 882*c0909341SAndroid Build Coastguard Worker sqxtun2 v2.8h, v3.4s 883*c0909341SAndroid Build Coastguard Worker sqxtun v3.4h, v4.4s 884*c0909341SAndroid Build Coastguard Worker sqxtun2 v3.8h, v5.4s 885*c0909341SAndroid Build Coastguard Worker umin v2.8h, v2.8h, v28.8h // bitdepth_max 886*c0909341SAndroid Build Coastguard Worker umin v3.8h, v3.8h, v28.8h 887*c0909341SAndroid Build Coastguard Worker 888*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 889*c0909341SAndroid Build Coastguard Worker st1 {v2.8h, v3.8h}, [x0], #32 890*c0909341SAndroid Build Coastguard Worker b.gt 1b 891*c0909341SAndroid Build Coastguard Worker 892*c0909341SAndroid Build Coastguard Worker ldp x0, x4, [sp, #32] 893*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #16] 894*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp], #48 895*c0909341SAndroid Build Coastguard Worker 896*c0909341SAndroid Build Coastguard Worker ret 897*c0909341SAndroid Build Coastguard Workerendfunc 898*c0909341SAndroid Build Coastguard Worker 899*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_hv_16bpc_neon 900*c0909341SAndroid Build Coastguard Worker // Backing up/restoring registers shifted, so that x11 gets the value 901*c0909341SAndroid Build Coastguard Worker // of x12, etc, and x15==x11, afterwards. 902*c0909341SAndroid Build Coastguard Worker stp x12, x13, [sp, #-64]! 903*c0909341SAndroid Build Coastguard Worker stp x14, x15, [sp, #16] 904*c0909341SAndroid Build Coastguard Worker stp x12, x0, [sp, #32] 905*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #48] 906*c0909341SAndroid Build Coastguard Worker 907*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 908*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 909*c0909341SAndroid Build Coastguard Worker b.eq 1f 910*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 911*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 912*c0909341SAndroid Build Coastguard Worker // left == NULL 913*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 914*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 915*c0909341SAndroid Build Coastguard Worker b 2f 916*c0909341SAndroid Build Coastguard Worker 917*c0909341SAndroid Build Coastguard Worker0: 918*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 919*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 920*c0909341SAndroid Build Coastguard Worker ld1 {v4.d}[1], [x2], #8 921*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded earlier, 922*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 923*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 924*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #12 925*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #12 926*c0909341SAndroid Build Coastguard Worker b 2f 927*c0909341SAndroid Build Coastguard Worker1: 928*c0909341SAndroid Build Coastguard Worker ld1 {v2.8h, v3.8h}, [x3], #32 929*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost pixel 930*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 2x the first pixel at the front. 931*c0909341SAndroid Build Coastguard Worker dup v4.8h, v2.h[0] 932*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded before, 933*c0909341SAndroid Build Coastguard Worker // which we shifted out. 934*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 935*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #12 936*c0909341SAndroid Build Coastguard Worker ext v2.16b, v4.16b, v2.16b, #12 937*c0909341SAndroid Build Coastguard Worker 938*c0909341SAndroid Build Coastguard Worker2: 939*c0909341SAndroid Build Coastguard Worker ld1 {v4.8h}, [x3], #16 940*c0909341SAndroid Build Coastguard Worker 941*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 942*c0909341SAndroid Build Coastguard Worker b.ne 4f 943*c0909341SAndroid Build Coastguard Worker 944*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 945*c0909341SAndroid Build Coastguard Worker 946*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 947*c0909341SAndroid Build Coastguard Worker cmp w4, #18 948*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 18, all used input pixels are valid 949*c0909341SAndroid Build Coastguard Worker 950*c0909341SAndroid Build Coastguard Worker // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, 951*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 952*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 953*c0909341SAndroid Build Coastguard Worker 954*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie 955*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 956*c0909341SAndroid Build Coastguard Worker sub w17, w4, #23 957*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the 958*c0909341SAndroid Build Coastguard Worker // buffer pointer. 959*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -4 960*c0909341SAndroid Build Coastguard Worker ldr h26, [x3, w17, sxtw #1] 961*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 962*c0909341SAndroid Build Coastguard Worker dup v26.8h, v26.h[0] 963*c0909341SAndroid Build Coastguard Worker ld1 {v23.16b, v24.16b, v25.16b}, [x6] 964*c0909341SAndroid Build Coastguard Worker 965*c0909341SAndroid Build Coastguard Worker bit v2.16b, v26.16b, v23.16b 966*c0909341SAndroid Build Coastguard Worker bit v3.16b, v26.16b, v24.16b 967*c0909341SAndroid Build Coastguard Worker bit v4.16b, v26.16b, v25.16b 968*c0909341SAndroid Build Coastguard Worker 969*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 970*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 971*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 972*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 973*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 974*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 975*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v2.8h 976*c0909341SAndroid Build Coastguard Worker smull v6.4s, v17.4h, v0.h[3] 977*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v18.4h, v0.h[2] 978*c0909341SAndroid Build Coastguard Worker smlal v6.4s, v19.4h, v0.h[1] 979*c0909341SAndroid Build Coastguard Worker smull2 v7.4s, v17.8h, v0.h[3] 980*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v18.8h, v0.h[2] 981*c0909341SAndroid Build Coastguard Worker smlal2 v7.4s, v19.8h, v0.h[1] 982*c0909341SAndroid Build Coastguard Worker 983*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 984*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 985*c0909341SAndroid Build Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 986*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 987*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 988*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v3.8h 989*c0909341SAndroid Build Coastguard Worker smull v24.4s, v17.4h, v0.h[3] 990*c0909341SAndroid Build Coastguard Worker smlal v24.4s, v18.4h, v0.h[2] 991*c0909341SAndroid Build Coastguard Worker smlal v24.4s, v19.4h, v0.h[1] 992*c0909341SAndroid Build Coastguard Worker smull2 v25.4s, v17.8h, v0.h[3] 993*c0909341SAndroid Build Coastguard Worker smlal2 v25.4s, v18.8h, v0.h[2] 994*c0909341SAndroid Build Coastguard Worker smlal2 v25.4s, v19.8h, v0.h[1] 995*c0909341SAndroid Build Coastguard Worker 996*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x11], #32 997*c0909341SAndroid Build Coastguard Worker mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 998*c0909341SAndroid Build Coastguard Worker add v6.4s, v6.4s, v30.4s 999*c0909341SAndroid Build Coastguard Worker add v7.4s, v7.4s, v30.4s 1000*c0909341SAndroid Build Coastguard Worker add v24.4s, v24.4s, v30.4s 1001*c0909341SAndroid Build Coastguard Worker add v25.4s, v25.4s, v30.4s 1002*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x12], #32 1003*c0909341SAndroid Build Coastguard Worker srshl v6.4s, v6.4s, v29.4s 1004*c0909341SAndroid Build Coastguard Worker srshl v7.4s, v7.4s, v29.4s 1005*c0909341SAndroid Build Coastguard Worker srshl v24.4s, v24.4s, v29.4s 1006*c0909341SAndroid Build Coastguard Worker srshl v25.4s, v25.4s, v29.4s 1007*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x13], #32 1008*c0909341SAndroid Build Coastguard Worker sqxtun v6.4h, v6.4s 1009*c0909341SAndroid Build Coastguard Worker sqxtun2 v6.8h, v7.4s 1010*c0909341SAndroid Build Coastguard Worker sqxtun v7.4h, v24.4s 1011*c0909341SAndroid Build Coastguard Worker sqxtun2 v7.8h, v25.4s 1012*c0909341SAndroid Build Coastguard Worker ld1 {v22.8h, v23.8h}, [x14], #32 1013*c0909341SAndroid Build Coastguard Worker umin v6.8h, v6.8h, v26.8h 1014*c0909341SAndroid Build Coastguard Worker umin v7.8h, v7.8h, v26.8h 1015*c0909341SAndroid Build Coastguard Worker sub v6.8h, v6.8h, v31.8h 1016*c0909341SAndroid Build Coastguard Worker sub v7.8h, v7.8h, v31.8h 1017*c0909341SAndroid Build Coastguard Worker 1018*c0909341SAndroid Build Coastguard Worker smull v8.4s, v16.4h, v0.h[5] 1019*c0909341SAndroid Build Coastguard Worker smlal v8.4s, v18.4h, v0.h[6] 1020*c0909341SAndroid Build Coastguard Worker smlal v8.4s, v20.4h, v0.h[7] 1021*c0909341SAndroid Build Coastguard Worker smlal v8.4s, v22.4h, v0.h[6] 1022*c0909341SAndroid Build Coastguard Worker smlal v8.4s, v6.4h, v0.h[5] 1023*c0909341SAndroid Build Coastguard Worker smull2 v9.4s, v16.8h, v0.h[5] 1024*c0909341SAndroid Build Coastguard Worker smlal2 v9.4s, v18.8h, v0.h[6] 1025*c0909341SAndroid Build Coastguard Worker smlal2 v9.4s, v20.8h, v0.h[7] 1026*c0909341SAndroid Build Coastguard Worker smlal2 v9.4s, v22.8h, v0.h[6] 1027*c0909341SAndroid Build Coastguard Worker smlal2 v9.4s, v6.8h, v0.h[5] 1028*c0909341SAndroid Build Coastguard Worker smull v1.4s, v17.4h, v0.h[5] 1029*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v19.4h, v0.h[6] 1030*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v21.4h, v0.h[7] 1031*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v23.4h, v0.h[6] 1032*c0909341SAndroid Build Coastguard Worker smlal v1.4s, v7.4h, v0.h[5] 1033*c0909341SAndroid Build Coastguard Worker smull2 v5.4s, v17.8h, v0.h[5] 1034*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v19.8h, v0.h[6] 1035*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v21.8h, v0.h[7] 1036*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v23.8h, v0.h[6] 1037*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v7.8h, v0.h[5] 1038*c0909341SAndroid Build Coastguard Worker srshl v8.4s, v8.4s, v27.4s // -round_bits_v 1039*c0909341SAndroid Build Coastguard Worker srshl v9.4s, v9.4s, v27.4s 1040*c0909341SAndroid Build Coastguard Worker srshl v1.4s, v1.4s, v27.4s 1041*c0909341SAndroid Build Coastguard Worker srshl v5.4s, v5.4s, v27.4s 1042*c0909341SAndroid Build Coastguard Worker sqxtun v8.4h, v8.4s 1043*c0909341SAndroid Build Coastguard Worker sqxtun2 v8.8h, v9.4s 1044*c0909341SAndroid Build Coastguard Worker sqxtun v9.4h, v1.4s 1045*c0909341SAndroid Build Coastguard Worker sqxtun2 v9.8h, v5.4s 1046*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x15], #32 1047*c0909341SAndroid Build Coastguard Worker umin v8.8h, v8.8h, v28.8h // bitdepth_max 1048*c0909341SAndroid Build Coastguard Worker umin v9.8h, v9.8h, v28.8h 1049*c0909341SAndroid Build Coastguard Worker 1050*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 1051*c0909341SAndroid Build Coastguard Worker 1052*c0909341SAndroid Build Coastguard Worker st1 {v8.8h, v9.8h}, [x0], #32 1053*c0909341SAndroid Build Coastguard Worker 1054*c0909341SAndroid Build Coastguard Worker b.le 0f 1055*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 1056*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 1057*c0909341SAndroid Build Coastguard Worker ld1 {v3.8h, v4.8h}, [x3], #32 1058*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep filtering. 1059*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 1060*c0909341SAndroid Build Coastguard Worker 1061*c0909341SAndroid Build Coastguard Worker0: 1062*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp, #48] 1063*c0909341SAndroid Build Coastguard Worker ldp x15, x0, [sp, #32] 1064*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #16] 1065*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp], #64 1066*c0909341SAndroid Build Coastguard Worker 1067*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 1068*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 1069*c0909341SAndroid Build Coastguard Worker 1070*c0909341SAndroid Build Coastguard Worker ret 1071*c0909341SAndroid Build Coastguard Workerendfunc 1072*c0909341SAndroid Build Coastguard Worker 1073*c0909341SAndroid Build Coastguard Worker#include "looprestoration_tmpl.S" 1074*c0909341SAndroid Build Coastguard Worker 1075*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, 1076*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], 1077*c0909341SAndroid Build Coastguard Worker// const pixel *src, const int w, 1078*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges); 1079*c0909341SAndroid Build Coastguard Workerfunction sgr_box3_row_h_16bpc_neon, export=1 1080*c0909341SAndroid Build Coastguard Worker add w4, w4, #2 // w += 2 1081*c0909341SAndroid Build Coastguard Worker 1082*c0909341SAndroid Build Coastguard Worker tst w5, #1 // LR_HAVE_LEFT 1083*c0909341SAndroid Build Coastguard Worker b.eq 1f 1084*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 1085*c0909341SAndroid Build Coastguard Worker 1086*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT && left == NULL 1087*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 1088*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1089*c0909341SAndroid Build Coastguard Worker b 2f 1090*c0909341SAndroid Build Coastguard Worker 1091*c0909341SAndroid Build Coastguard Worker0: 1092*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 1093*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1094*c0909341SAndroid Build Coastguard Worker ld1 {v2.d}[1], [x2] 1095*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded earlier, 1096*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 1097*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 1098*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #12 1099*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #12 1100*c0909341SAndroid Build Coastguard Worker b 2f 1101*c0909341SAndroid Build Coastguard Worker 1102*c0909341SAndroid Build Coastguard Worker1: 1103*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1104*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost pixel 1105*c0909341SAndroid Build Coastguard Worker // and shift v0/v1 to have 2x the first pixel at the front. 1106*c0909341SAndroid Build Coastguard Worker dup v2.8h, v0.h[0] 1107*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 pixels we loaded before, 1108*c0909341SAndroid Build Coastguard Worker // which we shifted out. 1109*c0909341SAndroid Build Coastguard Worker sub x3, x3, #4 1110*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #12 1111*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #12 1112*c0909341SAndroid Build Coastguard Worker 1113*c0909341SAndroid Build Coastguard Worker2: 1114*c0909341SAndroid Build Coastguard Worker tst w5, #2 // LR_HAVE_RIGHT 1115*c0909341SAndroid Build Coastguard Worker b.ne 4f 1116*c0909341SAndroid Build Coastguard Worker // If we'll need to pad the right edge, load that pixel to pad with 1117*c0909341SAndroid Build Coastguard Worker // here since we can find it pretty easily from here. 1118*c0909341SAndroid Build Coastguard Worker sub w13, w4, #(2 + 16 - 2 + 1) 1119*c0909341SAndroid Build Coastguard Worker ldr h30, [x3, w13, sxtw #1] 1120*c0909341SAndroid Build Coastguard Worker // Fill v30 with the right padding pixel 1121*c0909341SAndroid Build Coastguard Worker dup v30.8h, v30.h[0] 1122*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 1123*c0909341SAndroid Build Coastguard Worker 1124*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 1125*c0909341SAndroid Build Coastguard Worker cmp w4, #10 1126*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 10, all used input pixels are valid 1127*c0909341SAndroid Build Coastguard Worker 1128*c0909341SAndroid Build Coastguard Worker // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called 1129*c0909341SAndroid Build Coastguard Worker // again; it's not strictly needed in those cases (we pad enough here), 1130*c0909341SAndroid Build Coastguard Worker // but keeping the code as simple as possible. 1131*c0909341SAndroid Build Coastguard Worker 1132*c0909341SAndroid Build Coastguard Worker // Insert padding in v0.b[w] onwards 1133*c0909341SAndroid Build Coastguard Worker movrel x13, right_ext_mask 1134*c0909341SAndroid Build Coastguard Worker sub x13, x13, w4, uxtw #1 1135*c0909341SAndroid Build Coastguard Worker ld1 {v28.16b, v29.16b}, [x13] 1136*c0909341SAndroid Build Coastguard Worker 1137*c0909341SAndroid Build Coastguard Worker bit v0.16b, v30.16b, v28.16b 1138*c0909341SAndroid Build Coastguard Worker bit v1.16b, v30.16b, v29.16b 1139*c0909341SAndroid Build Coastguard Worker 1140*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 1141*c0909341SAndroid Build Coastguard Worker ext v26.16b, v0.16b, v1.16b, #2 1142*c0909341SAndroid Build Coastguard Worker ext v27.16b, v0.16b, v1.16b, #4 1143*c0909341SAndroid Build Coastguard Worker 1144*c0909341SAndroid Build Coastguard Worker add v6.8h, v0.8h, v26.8h 1145*c0909341SAndroid Build Coastguard Worker umull v22.4s, v0.4h, v0.4h 1146*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v26.4h, v26.4h 1147*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v27.4h, v27.4h 1148*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v27.8h 1149*c0909341SAndroid Build Coastguard Worker umull2 v23.4s, v0.8h, v0.8h 1150*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v26.8h, v26.8h 1151*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v27.8h, v27.8h 1152*c0909341SAndroid Build Coastguard Worker 1153*c0909341SAndroid Build Coastguard Worker subs w4, w4, #8 1154*c0909341SAndroid Build Coastguard Worker 1155*c0909341SAndroid Build Coastguard Worker st1 {v6.8h}, [x1], #16 1156*c0909341SAndroid Build Coastguard Worker st1 {v22.4s,v23.4s}, [x0], #32 1157*c0909341SAndroid Build Coastguard Worker 1158*c0909341SAndroid Build Coastguard Worker b.le 9f 1159*c0909341SAndroid Build Coastguard Worker tst w5, #2 // LR_HAVE_RIGHT 1160*c0909341SAndroid Build Coastguard Worker mov v0.16b, v1.16b 1161*c0909341SAndroid Build Coastguard Worker ld1 {v1.8h}, [x3], #16 1162*c0909341SAndroid Build Coastguard Worker 1163*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep summing. 1164*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 1165*c0909341SAndroid Build Coastguard Worker 1166*c0909341SAndroid Build Coastguard Worker9: 1167*c0909341SAndroid Build Coastguard Worker ret 1168*c0909341SAndroid Build Coastguard Workerendfunc 1169*c0909341SAndroid Build Coastguard Worker 1170*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, 1171*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], 1172*c0909341SAndroid Build Coastguard Worker// const pixel *src, const int w, 1173*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges); 1174*c0909341SAndroid Build Coastguard Workerfunction sgr_box5_row_h_16bpc_neon, export=1 1175*c0909341SAndroid Build Coastguard Worker add w4, w4, #2 // w += 2 1176*c0909341SAndroid Build Coastguard Worker 1177*c0909341SAndroid Build Coastguard Worker tst w5, #1 // LR_HAVE_LEFT 1178*c0909341SAndroid Build Coastguard Worker b.eq 1f 1179*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 1180*c0909341SAndroid Build Coastguard Worker 1181*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT && left == NULL 1182*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 1183*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1184*c0909341SAndroid Build Coastguard Worker b 2f 1185*c0909341SAndroid Build Coastguard Worker 1186*c0909341SAndroid Build Coastguard Worker0: 1187*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 1188*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1189*c0909341SAndroid Build Coastguard Worker ld1 {v2.d}[1], [x2], #8 1190*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded earlier, 1191*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 1192*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 1193*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #10 1194*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #10 1195*c0909341SAndroid Build Coastguard Worker b 2f 1196*c0909341SAndroid Build Coastguard Worker 1197*c0909341SAndroid Build Coastguard Worker1: 1198*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x3], #32 1199*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost pixel 1200*c0909341SAndroid Build Coastguard Worker // and shift v0/v1 to have 3x the first pixel at the front. 1201*c0909341SAndroid Build Coastguard Worker dup v2.8h, v0.h[0] 1202*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded before, 1203*c0909341SAndroid Build Coastguard Worker // which we shifted out. 1204*c0909341SAndroid Build Coastguard Worker sub x3, x3, #6 1205*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #10 1206*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #10 1207*c0909341SAndroid Build Coastguard Worker 1208*c0909341SAndroid Build Coastguard Worker2: 1209*c0909341SAndroid Build Coastguard Worker tst w5, #2 // LR_HAVE_RIGHT 1210*c0909341SAndroid Build Coastguard Worker b.ne 4f 1211*c0909341SAndroid Build Coastguard Worker // If we'll need to pad the right edge, load that pixel to pad with 1212*c0909341SAndroid Build Coastguard Worker // here since we can find it pretty easily from here. 1213*c0909341SAndroid Build Coastguard Worker sub w13, w4, #(2 + 16 - 3 + 1) 1214*c0909341SAndroid Build Coastguard Worker ldr h30, [x3, w13, sxtw #1] 1215*c0909341SAndroid Build Coastguard Worker // Fill v30 with the right padding pixel 1216*c0909341SAndroid Build Coastguard Worker dup v30.8h, v30.h[0] 1217*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 1218*c0909341SAndroid Build Coastguard Worker 1219*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 1220*c0909341SAndroid Build Coastguard Worker cmp w4, #11 1221*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 11, all used input pixels are valid 1222*c0909341SAndroid Build Coastguard Worker 1223*c0909341SAndroid Build Coastguard Worker // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, 1224*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 1225*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 1226*c0909341SAndroid Build Coastguard Worker 1227*c0909341SAndroid Build Coastguard Worker // Insert padding in v0.b[w+1] onwards; fuse the +1 into the 1228*c0909341SAndroid Build Coastguard Worker // buffer pointer. 1229*c0909341SAndroid Build Coastguard Worker movrel x13, right_ext_mask, -1 1230*c0909341SAndroid Build Coastguard Worker sub x13, x13, w4, uxtw #1 1231*c0909341SAndroid Build Coastguard Worker ld1 {v28.16b, v29.16b}, [x13] 1232*c0909341SAndroid Build Coastguard Worker 1233*c0909341SAndroid Build Coastguard Worker bit v0.16b, v30.16b, v28.16b 1234*c0909341SAndroid Build Coastguard Worker bit v1.16b, v30.16b, v29.16b 1235*c0909341SAndroid Build Coastguard Worker 1236*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 1237*c0909341SAndroid Build Coastguard Worker ext v26.16b, v0.16b, v1.16b, #2 1238*c0909341SAndroid Build Coastguard Worker ext v27.16b, v0.16b, v1.16b, #4 1239*c0909341SAndroid Build Coastguard Worker 1240*c0909341SAndroid Build Coastguard Worker add v6.8h, v0.8h, v26.8h 1241*c0909341SAndroid Build Coastguard Worker umull v22.4s, v0.4h, v0.4h 1242*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v26.4h, v26.4h 1243*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v27.4h, v27.4h 1244*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v27.8h 1245*c0909341SAndroid Build Coastguard Worker umull2 v23.4s, v0.8h, v0.8h 1246*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v26.8h, v26.8h 1247*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v27.8h, v27.8h 1248*c0909341SAndroid Build Coastguard Worker 1249*c0909341SAndroid Build Coastguard Worker ext v26.16b, v0.16b, v1.16b, #6 1250*c0909341SAndroid Build Coastguard Worker ext v27.16b, v0.16b, v1.16b, #8 1251*c0909341SAndroid Build Coastguard Worker 1252*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v26.8h 1253*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v26.4h, v26.4h 1254*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v27.4h, v27.4h 1255*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v27.8h 1256*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v26.8h, v26.8h 1257*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v27.8h, v27.8h 1258*c0909341SAndroid Build Coastguard Worker 1259*c0909341SAndroid Build Coastguard Worker subs w4, w4, #8 1260*c0909341SAndroid Build Coastguard Worker 1261*c0909341SAndroid Build Coastguard Worker st1 {v6.8h}, [x1], #16 1262*c0909341SAndroid Build Coastguard Worker st1 {v22.4s,v23.4s}, [x0], #32 1263*c0909341SAndroid Build Coastguard Worker 1264*c0909341SAndroid Build Coastguard Worker b.le 9f 1265*c0909341SAndroid Build Coastguard Worker tst w5, #2 // LR_HAVE_RIGHT 1266*c0909341SAndroid Build Coastguard Worker mov v0.16b, v1.16b 1267*c0909341SAndroid Build Coastguard Worker ld1 {v1.8h}, [x3], #16 1268*c0909341SAndroid Build Coastguard Worker 1269*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep summing. 1270*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 1271*c0909341SAndroid Build Coastguard Worker 1272*c0909341SAndroid Build Coastguard Worker9: 1273*c0909341SAndroid Build Coastguard Worker ret 1274*c0909341SAndroid Build Coastguard Workerendfunc 1275*c0909341SAndroid Build Coastguard Worker 1276*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3, 1277*c0909341SAndroid Build Coastguard Worker// int32_t *sumsq5, int16_t *sum5, 1278*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], 1279*c0909341SAndroid Build Coastguard Worker// const pixel *src, const int w, 1280*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges); 1281*c0909341SAndroid Build Coastguard Workerfunction sgr_box35_row_h_16bpc_neon, export=1 1282*c0909341SAndroid Build Coastguard Worker add w6, w6, #2 // w += 2 1283*c0909341SAndroid Build Coastguard Worker 1284*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 1285*c0909341SAndroid Build Coastguard Worker b.eq 1f 1286*c0909341SAndroid Build Coastguard Worker cbnz x4, 0f 1287*c0909341SAndroid Build Coastguard Worker 1288*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT && left == NULL 1289*c0909341SAndroid Build Coastguard Worker sub x5, x5, #6 1290*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x5], #32 1291*c0909341SAndroid Build Coastguard Worker b 2f 1292*c0909341SAndroid Build Coastguard Worker 1293*c0909341SAndroid Build Coastguard Worker0: 1294*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 1295*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x5], #32 1296*c0909341SAndroid Build Coastguard Worker ld1 {v2.d}[1], [x4], #8 1297*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 3 pixels we loaded earlier, 1298*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 1299*c0909341SAndroid Build Coastguard Worker sub x5, x5, #6 1300*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #10 1301*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #10 1302*c0909341SAndroid Build Coastguard Worker b 2f 1303*c0909341SAndroid Build Coastguard Worker 1304*c0909341SAndroid Build Coastguard Worker1: 1305*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x5], #32 1306*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost pixel 1307*c0909341SAndroid Build Coastguard Worker // and shift v0/v1 to have 3x the first pixel at the front. 1308*c0909341SAndroid Build Coastguard Worker dup v2.8h, v0.h[0] 1309*c0909341SAndroid Build Coastguard Worker // Move x5 back to account for the last 3 pixels we loaded before, 1310*c0909341SAndroid Build Coastguard Worker // which we shifted out. 1311*c0909341SAndroid Build Coastguard Worker sub x5, x5, #6 1312*c0909341SAndroid Build Coastguard Worker ext v1.16b, v0.16b, v1.16b, #10 1313*c0909341SAndroid Build Coastguard Worker ext v0.16b, v2.16b, v0.16b, #10 1314*c0909341SAndroid Build Coastguard Worker 1315*c0909341SAndroid Build Coastguard Worker2: 1316*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 1317*c0909341SAndroid Build Coastguard Worker b.ne 4f 1318*c0909341SAndroid Build Coastguard Worker // If we'll need to pad the right edge, load that pixel to pad with 1319*c0909341SAndroid Build Coastguard Worker // here since we can find it pretty easily from here. 1320*c0909341SAndroid Build Coastguard Worker sub w13, w6, #(2 + 16 - 3 + 1) 1321*c0909341SAndroid Build Coastguard Worker ldr h30, [x5, w13, sxtw #1] 1322*c0909341SAndroid Build Coastguard Worker // Fill v30 with the right padding pixel 1323*c0909341SAndroid Build Coastguard Worker dup v30.8h, v30.h[0] 1324*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 1325*c0909341SAndroid Build Coastguard Worker 1326*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 1327*c0909341SAndroid Build Coastguard Worker cmp w6, #11 1328*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 11, all used input pixels are valid 1329*c0909341SAndroid Build Coastguard Worker 1330*c0909341SAndroid Build Coastguard Worker // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, 1331*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 1332*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 1333*c0909341SAndroid Build Coastguard Worker 1334*c0909341SAndroid Build Coastguard Worker // Insert padding in v0.b[w+1] onwards; fuse the +1 into the 1335*c0909341SAndroid Build Coastguard Worker // buffer pointer. 1336*c0909341SAndroid Build Coastguard Worker movrel x13, right_ext_mask, -1 1337*c0909341SAndroid Build Coastguard Worker sub x13, x13, w6, uxtw #1 1338*c0909341SAndroid Build Coastguard Worker ld1 {v28.16b, v29.16b}, [x13] 1339*c0909341SAndroid Build Coastguard Worker 1340*c0909341SAndroid Build Coastguard Worker bit v0.16b, v30.16b, v28.16b 1341*c0909341SAndroid Build Coastguard Worker bit v1.16b, v30.16b, v29.16b 1342*c0909341SAndroid Build Coastguard Worker 1343*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 1344*c0909341SAndroid Build Coastguard Worker ext v16.16b, v0.16b, v1.16b, #2 1345*c0909341SAndroid Build Coastguard Worker ext v17.16b, v0.16b, v1.16b, #4 1346*c0909341SAndroid Build Coastguard Worker ext v19.16b, v0.16b, v1.16b, #8 1347*c0909341SAndroid Build Coastguard Worker ext v18.16b, v0.16b, v1.16b, #6 1348*c0909341SAndroid Build Coastguard Worker 1349*c0909341SAndroid Build Coastguard Worker add v20.8h, v16.8h, v17.8h 1350*c0909341SAndroid Build Coastguard Worker add v21.8h, v0.8h, v19.8h 1351*c0909341SAndroid Build Coastguard Worker add v20.8h, v20.8h, v18.8h 1352*c0909341SAndroid Build Coastguard Worker 1353*c0909341SAndroid Build Coastguard Worker umull v22.4s, v16.4h, v16.4h 1354*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v17.4h, v17.4h 1355*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v18.4h, v18.4h 1356*c0909341SAndroid Build Coastguard Worker 1357*c0909341SAndroid Build Coastguard Worker umull2 v23.4s, v16.8h, v16.8h 1358*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v17.8h, v17.8h 1359*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v18.8h, v18.8h 1360*c0909341SAndroid Build Coastguard Worker 1361*c0909341SAndroid Build Coastguard Worker add v21.8h, v21.8h, v20.8h 1362*c0909341SAndroid Build Coastguard Worker st1 {v20.8h}, [x1], #16 1363*c0909341SAndroid Build Coastguard Worker st1 {v22.4s,v23.4s}, [x0], #32 1364*c0909341SAndroid Build Coastguard Worker 1365*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v0.4h, v0.4h 1366*c0909341SAndroid Build Coastguard Worker umlal v22.4s, v19.4h, v19.4h 1367*c0909341SAndroid Build Coastguard Worker 1368*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v0.8h, v0.8h 1369*c0909341SAndroid Build Coastguard Worker umlal2 v23.4s, v19.8h, v19.8h 1370*c0909341SAndroid Build Coastguard Worker 1371*c0909341SAndroid Build Coastguard Worker subs w6, w6, #8 1372*c0909341SAndroid Build Coastguard Worker 1373*c0909341SAndroid Build Coastguard Worker st1 {v21.8h}, [x3], #16 1374*c0909341SAndroid Build Coastguard Worker st1 {v22.4s,v23.4s}, [x2], #32 1375*c0909341SAndroid Build Coastguard Worker 1376*c0909341SAndroid Build Coastguard Worker b.le 9f 1377*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 1378*c0909341SAndroid Build Coastguard Worker mov v0.16b, v1.16b 1379*c0909341SAndroid Build Coastguard Worker ld1 {v1.8h}, [x5], #16 1380*c0909341SAndroid Build Coastguard Worker 1381*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep summing. 1382*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 1383*c0909341SAndroid Build Coastguard Worker 1384*c0909341SAndroid Build Coastguard Worker9: 1385*c0909341SAndroid Build Coastguard Worker ret 1386*c0909341SAndroid Build Coastguard Workerendfunc 1387*c0909341SAndroid Build Coastguard Worker 1388*c0909341SAndroid Build Coastguard Workersgr_funcs 16 1389