/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Right-edge padding mask: 48 bytes of 0x00 directly followed by 48 bytes of
// 0xff. The filters below load 48 bytes starting a width-dependent distance
// before right_ext_mask, which gives a mask that is zero for the leading
// (valid) lanes and all-ones for the trailing lanes; `bit` then inserts the
// padding pixel into the all-ones lanes only.
const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
//
// Register use (AAPCS64 mapping of the prototype above):
//   x0 = p, x1 = stride, x2 = left, x3 = lpf, w4 = w, w5 = h,
//   x6 = filter, w7 = edges
//   v0 = filter[0] (used by the horizontal pass), v1 = filter[1] (used by the
//   vertical pass); v30/v31 hold constants used by the horizontal pass.
// The stack scratch area is 384*2*6 bytes; x9-x15 point at rows of it that
// are 384*2 bytes apart (t6..t0, see below).
function wiener_filter7_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        // The flags set here are consumed by the b.eq L(no_top_7) further
        // down, so nothing in between may modify NZCV.
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h,  w17
        movi            v31.8h,  #8, lsl #8

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8               // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_8bpc_neon
        bl              wiener_filter7_hv_8bpc_neon
L(v1_7):
        bl              wiener_filter7_v_8bpc_neon

        mov             sp,  x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_8bpc_neon
L(v2_7):
        bl              wiener_filter7_v_8bpc_neon
        b               L(v1_7)
endfunc


// Horizontal pass over one row, writing 16-bit intermediates.
// In:   x2  = left (may be NULL), x3 = source row, w4 = w, w7 = edges,
//       x14 = destination row, v0 = horizontal coefficients,
//       v30/v31 = constants set up by the caller.
// Out:  x3, x4 and x14 are restored from the stack before returning;
//       x2 is advanced by 4 when LR_HAVE_LEFT is set and left != NULL.
// May modify: x6, x17, v2-v4, v6, v7, v16-v23, v25-v28, flags.
function wiener_filter7_h_8bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1               // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2               // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2               // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

// Vertical pass over one output row, using only previously stored rows.
// In:   x0 = destination pixels, x1 = stride, w4 = w,
//       x9-x14 = intermediate rows t6..t1, v1 = vertical coefficients.
//       The x14 row is paired with both the x10 and the x9 rows.
// Out:  x0 += x1; x4 restored; the row pointers are rotated by one:
//       x9 <- x10, x10 <- x11, x11 <- x12, x12 <- x13, x13 <- x14,
//       x14 unchanged.
// May modify: v2-v5, v16-v29, flags.
function wiener_filter7_v_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0,  x4,  [sp, #48]
1:
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v24.8h, v25.8h}, [x13], #32

        ld1             {v18.8h, v19.8h}, [x10], #32
        add             v24.8h,  v24.8h,  v20.8h
        ld1             {v26.8h, v27.8h}, [x14], #32

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v28.8h,  v26.8h,  v18.8h
        ld1             {v22.8h, v23.8h}, [x12], #32

        add             v16.8h,  v26.8h,  v16.8h
        add             v25.8h,  v25.8h,  v21.8h

        smull           v2.4s,   v22.4h,  v1.h[3]
        smlal           v2.4s,   v24.4h,  v1.h[4]
        smlal           v2.4s,   v28.4h,  v1.h[5]
        smlal           v2.4s,   v16.4h,  v1.h[6]
        add             v29.8h,  v27.8h,  v19.8h
        smull2          v3.4s,   v22.8h,  v1.h[3]
        smlal2          v3.4s,   v24.8h,  v1.h[4]
        smlal2          v3.4s,   v28.8h,  v1.h[5]
        smlal2          v3.4s,   v16.8h,  v1.h[6]
        add             v17.8h,  v27.8h,  v17.8h
        smull           v4.4s,   v23.4h,  v1.h[3]
        smlal           v4.4s,   v25.4h,  v1.h[4]
        smlal           v4.4s,   v29.4h,  v1.h[5]
        smlal           v4.4s,   v17.4h,  v1.h[6]
        smull2          v5.4s,   v23.8h,  v1.h[3]
        smlal2          v5.4s,   v25.8h,  v1.h[4]
        smlal2          v5.4s,   v29.8h,  v1.h[5]
        smlal2          v5.4s,   v17.8h,  v1.h[6]
        sqrshrun        v2.4h,   v2.4s,   #11
        sqrshrun2       v2.8h,   v3.4s,   #11
        sqrshrun        v3.4h,   v4.4s,   #11
        sqrshrun2       v3.8h,   v5.4s,   #11
        sqxtun          v2.8b,   v2.8h
        sqxtun2         v2.16b,  v3.8h
        subs            w4,  w4,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0,  x4,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #64

        add             x0,  x0,  x1
        ret
endfunc

function wiener_filter7_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0,  [sp, #48]
        stp             x3,  x4,  [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1               // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2               // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        ld1             {v20.8h, v21.8h}, [x11], #32

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        ld1             {v26.8h, v27.8h}, [x13], #32
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        ld1             {v18.8h, v19.8h}, [x10], #32
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        ld1             {v28.8h, v29.8h}, [x14], #32
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v26.8h,  v20.8h,  v26.8h

        ld1             {v24.8h, v25.8h}, [x12], #32
        add             v28.8h,  v18.8h,  v28.8h

        add             v16.8h,  v16.8h,  v6.8h
        add             v27.8h,  v21.8h,  v27.8h

        smull           v18.4s,  v24.4h,  v1.h[3]
        smlal           v18.4s,  v26.4h,  v1.h[4]
        smlal           v18.4s,  v28.4h,  v1.h[5]
        smlal           v18.4s,  v16.4h,  v1.h[6]
        add             v29.8h,  v19.8h,  v29.8h
        smull2          v19.4s,  v24.8h,  v1.h[3]
        smlal2          v19.4s,  v26.8h,  v1.h[4]
        smlal2          v19.4s,  v28.8h,  v1.h[5]
        smlal2          v19.4s,  v16.8h,  v1.h[6]
        add             v17.8h,  v17.8h,  v7.8h
        smull           v20.4s,  v25.4h,  v1.h[3]
        smlal           v20.4s,  v27.4h,  v1.h[4]
        smlal           v20.4s,  v29.4h,  v1.h[5]
        smlal           v20.4s,  v17.4h,  v1.h[6]
        smull2          v21.4s,  v25.8h,  v1.h[3]
        smlal2          v21.4s,  v27.8h,  v1.h[4]
        smlal2          v21.4s,  v29.8h,  v1.h[5]
        smlal2          v21.4s,  v17.8h,  v1.h[6]
        sqrshrun        v18.4h,  v18.4s,  #11
        sqrshrun2       v18.8h,  v19.4s,  #11
        sqrshrun        v19.4h,  v20.4s,  #11
        sqrshrun2       v19.8h,  v21.4s,  #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4,  w4,  #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2               // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
522*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 523*c0909341SAndroid Build Coastguard Worker 524*c0909341SAndroid Build Coastguard Worker0: 525*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp, #64] 526*c0909341SAndroid Build Coastguard Worker ldp x15, x0, [sp, #48] 527*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #32] 528*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp, #16] 529*c0909341SAndroid Build Coastguard Worker ldp x9, x10, [sp], #80 530*c0909341SAndroid Build Coastguard Worker 531*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 532*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 533*c0909341SAndroid Build Coastguard Worker 534*c0909341SAndroid Build Coastguard Worker ret 535*c0909341SAndroid Build Coastguard Workerendfunc 536*c0909341SAndroid Build Coastguard Worker 537*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride, 538*c0909341SAndroid Build Coastguard Worker// const pixel (*left)[4], const pixel *lpf, 539*c0909341SAndroid Build Coastguard Worker// const int w, int h, 540*c0909341SAndroid Build Coastguard Worker// const int16_t filter[2][8], 541*c0909341SAndroid Build Coastguard Worker// const enum LrEdgeFlags edges); 542*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_8bpc_neon, export=1 543*c0909341SAndroid Build Coastguard Worker AARCH64_SIGN_LINK_REGISTER 544*c0909341SAndroid Build Coastguard Worker stp x29, x30, [sp, #-16]! 
545*c0909341SAndroid Build Coastguard Worker mov x29, sp 546*c0909341SAndroid Build Coastguard Worker ld1 {v0.8h, v1.8h}, [x6] 547*c0909341SAndroid Build Coastguard Worker tst w7, #4 // LR_HAVE_TOP 548*c0909341SAndroid Build Coastguard Worker sub_sp 384*2*4 549*c0909341SAndroid Build Coastguard Worker 550*c0909341SAndroid Build Coastguard Worker mov w17, #(1 << 14) - (1 << 2) 551*c0909341SAndroid Build Coastguard Worker dup v30.8h, w17 552*c0909341SAndroid Build Coastguard Worker movi v31.8h, #8, lsl #8 553*c0909341SAndroid Build Coastguard Worker 554*c0909341SAndroid Build Coastguard Worker // x11 - t4 555*c0909341SAndroid Build Coastguard Worker // x12 - t3 556*c0909341SAndroid Build Coastguard Worker // x13 - t2 557*c0909341SAndroid Build Coastguard Worker // x14 - t1 558*c0909341SAndroid Build Coastguard Worker // x15 - t0 559*c0909341SAndroid Build Coastguard Worker mov x14, sp // t1 560*c0909341SAndroid Build Coastguard Worker b.eq L(no_top_5) 561*c0909341SAndroid Build Coastguard Worker 562*c0909341SAndroid Build Coastguard Worker mov x16, x2 // backup left 563*c0909341SAndroid Build Coastguard Worker mov x2, #0 564*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 565*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride 566*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 567*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 568*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 569*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 570*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // lpf += stride*5 571*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 572*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 573*c0909341SAndroid Build Coastguard Worker mov x2, x16 // left 574*c0909341SAndroid Build Coastguard Worker mov x16, x3 // backup lpf 575*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf 
= p 576*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 577*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 578*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 579*c0909341SAndroid Build Coastguard Worker b.eq L(v1_5) 580*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 581*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 582*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 583*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 584*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 585*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 586*c0909341SAndroid Build Coastguard Worker 587*c0909341SAndroid Build Coastguard WorkerL(main_5): 588*c0909341SAndroid Build Coastguard Worker mov x15, x11 // t0 = t4 589*c0909341SAndroid Build Coastguard WorkerL(main_loop_5): 590*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_8bpc_neon 591*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 592*c0909341SAndroid Build Coastguard Worker b.ne L(main_loop_5) 593*c0909341SAndroid Build Coastguard Worker tst w7, #8 // LR_HAVE_BOTTOM 594*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 595*c0909341SAndroid Build Coastguard Worker 596*c0909341SAndroid Build Coastguard Worker mov x3, x16 // restore lpf 597*c0909341SAndroid Build Coastguard Worker mov x2, #0 // left = NULL 598*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_8bpc_neon 599*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_8bpc_neon 600*c0909341SAndroid Build Coastguard WorkerL(end_5): 601*c0909341SAndroid Build Coastguard Worker 602*c0909341SAndroid Build Coastguard Worker mov sp, x29 603*c0909341SAndroid Build Coastguard Worker ldp x29, x30, [sp], #16 604*c0909341SAndroid Build Coastguard Worker AARCH64_VALIDATE_LINK_REGISTER 605*c0909341SAndroid Build Coastguard Worker ret 606*c0909341SAndroid Build 
Coastguard Worker 607*c0909341SAndroid Build Coastguard WorkerL(no_top_5): 608*c0909341SAndroid Build Coastguard Worker add x3, x3, x1, lsl #2 609*c0909341SAndroid Build Coastguard Worker add x16, x3, x1, lsl #1 // lpf += stride*6, backup 610*c0909341SAndroid Build Coastguard Worker mov x3, x0 // lpf = p 611*c0909341SAndroid Build Coastguard Worker 612*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 613*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 614*c0909341SAndroid Build Coastguard Worker mov x11, x14 // t4 615*c0909341SAndroid Build Coastguard Worker mov x12, x14 // t3 616*c0909341SAndroid Build Coastguard Worker mov x13, x14 // t2 617*c0909341SAndroid Build Coastguard Worker b.eq L(v1_5) 618*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 619*c0909341SAndroid Build Coastguard Worker add x14, x14, #384*2 // t1 += 384*2 620*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_h_8bpc_neon 621*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 622*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 623*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 // src += stride 624*c0909341SAndroid Build Coastguard Worker add x15, x14, #384*2 // t0 = t1 + 384*2 625*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_8bpc_neon 626*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 627*c0909341SAndroid Build Coastguard Worker b.eq L(v2_5) 628*c0909341SAndroid Build Coastguard Worker add x15, x15, #384*2*3 // t0 += 384*2*3 629*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_hv_8bpc_neon 630*c0909341SAndroid Build Coastguard Worker subs w5, w5, #1 // h-- 631*c0909341SAndroid Build Coastguard Worker b.ne L(main_5) 632*c0909341SAndroid Build Coastguard WorkerL(v2_5): 633*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_v_8bpc_neon 634*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 635*c0909341SAndroid Build Coastguard Worker mov x11, 
x12 636*c0909341SAndroid Build Coastguard Worker mov x12, x13 637*c0909341SAndroid Build Coastguard Worker mov x13, x14 638*c0909341SAndroid Build Coastguard WorkerL(v1_5): 639*c0909341SAndroid Build Coastguard Worker bl wiener_filter5_v_8bpc_neon 640*c0909341SAndroid Build Coastguard Worker b L(end_5) 641*c0909341SAndroid Build Coastguard Workerendfunc 642*c0909341SAndroid Build Coastguard Worker 643*c0909341SAndroid Build Coastguard Worker 644*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_h_8bpc_neon 645*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #-32]! 646*c0909341SAndroid Build Coastguard Worker str x14, [sp, #16] 647*c0909341SAndroid Build Coastguard Worker 648*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 649*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 650*c0909341SAndroid Build Coastguard Worker b.eq 1f 651*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 652*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 653*c0909341SAndroid Build Coastguard Worker // left == NULL 654*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 655*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 656*c0909341SAndroid Build Coastguard Worker b 2f 657*c0909341SAndroid Build Coastguard Worker 658*c0909341SAndroid Build Coastguard Worker0: 659*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 660*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 661*c0909341SAndroid Build Coastguard Worker ld1 {v2.s}[3], [x2], #4 662*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 bytes we loaded earlier, 663*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 
664*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 665*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #14 666*c0909341SAndroid Build Coastguard Worker b 2f 667*c0909341SAndroid Build Coastguard Worker 668*c0909341SAndroid Build Coastguard Worker1: 669*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 670*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost byte 671*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 3x the first byte at the front. 672*c0909341SAndroid Build Coastguard Worker dup v2.16b, v3.b[0] 673*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 bytes we loaded before, 674*c0909341SAndroid Build Coastguard Worker // which we shifted out. 675*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 676*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #14 677*c0909341SAndroid Build Coastguard Worker 678*c0909341SAndroid Build Coastguard Worker2: 679*c0909341SAndroid Build Coastguard Worker ld1 {v4.8b}, [x3], #8 680*c0909341SAndroid Build Coastguard Worker uxtl v2.8h, v3.8b 681*c0909341SAndroid Build Coastguard Worker uxtl2 v3.8h, v3.16b 682*c0909341SAndroid Build Coastguard Worker uxtl v4.8h, v4.8b 683*c0909341SAndroid Build Coastguard Worker 684*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 685*c0909341SAndroid Build Coastguard Worker b.ne 4f 686*c0909341SAndroid Build Coastguard Worker 687*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 688*c0909341SAndroid Build Coastguard Worker 689*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 690*c0909341SAndroid Build Coastguard Worker cmp w4, #18 691*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 18, all used input pixels are valid 692*c0909341SAndroid Build Coastguard Worker 693*c0909341SAndroid Build Coastguard Worker // 1 <= w < 18, w+2 pixels valid in v2-v4. 
For w>=9, 694*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 695*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 696*c0909341SAndroid Build Coastguard Worker 697*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie 698*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 699*c0909341SAndroid Build Coastguard Worker sub w17, w4, #23 700*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the 701*c0909341SAndroid Build Coastguard Worker // buffer pointer. 702*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -4 703*c0909341SAndroid Build Coastguard Worker ldr b28, [x3, w17, sxtw] 704*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 705*c0909341SAndroid Build Coastguard Worker dup v28.8h, v28.h[0] 706*c0909341SAndroid Build Coastguard Worker ld1 {v25.16b, v26.16b, v27.16b}, [x6] 707*c0909341SAndroid Build Coastguard Worker 708*c0909341SAndroid Build Coastguard Worker bit v2.16b, v28.16b, v25.16b 709*c0909341SAndroid Build Coastguard Worker bit v3.16b, v28.16b, v26.16b 710*c0909341SAndroid Build Coastguard Worker bit v4.16b, v28.16b, v27.16b 711*c0909341SAndroid Build Coastguard Worker 712*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 713*c0909341SAndroid Build Coastguard Worker // Interleaving the mul/mla chains actually hurts performance 714*c0909341SAndroid Build Coastguard Worker // significantly on Cortex A53, thus keeping mul/mla tightly 715*c0909341SAndroid Build Coastguard Worker // chained like this. 
716*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 717*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 718*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 719*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 720*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 721*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v2.8h 722*c0909341SAndroid Build Coastguard Worker shl v22.8h, v17.8h, #7 723*c0909341SAndroid Build Coastguard Worker mul v6.8h, v17.8h, v0.h[3] 724*c0909341SAndroid Build Coastguard Worker mla v6.8h, v18.8h, v0.h[4] 725*c0909341SAndroid Build Coastguard Worker mla v6.8h, v19.8h, v0.h[5] 726*c0909341SAndroid Build Coastguard Worker 727*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 728*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 729*c0909341SAndroid Build Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 730*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 731*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 732*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v3.8h 733*c0909341SAndroid Build Coastguard Worker shl v23.8h, v17.8h, #7 734*c0909341SAndroid Build Coastguard Worker mul v7.8h, v17.8h, v0.h[3] 735*c0909341SAndroid Build Coastguard Worker mla v7.8h, v18.8h, v0.h[4] 736*c0909341SAndroid Build Coastguard Worker mla v7.8h, v19.8h, v0.h[5] 737*c0909341SAndroid Build Coastguard Worker 738*c0909341SAndroid Build Coastguard Worker sub v22.8h, v22.8h, v30.8h 739*c0909341SAndroid Build Coastguard Worker sub v23.8h, v23.8h, v30.8h 740*c0909341SAndroid Build Coastguard Worker sqadd v6.8h, v6.8h, v22.8h 741*c0909341SAndroid Build Coastguard Worker sqadd v7.8h, v7.8h, v23.8h 742*c0909341SAndroid Build Coastguard Worker sshr v6.8h, v6.8h, #3 743*c0909341SAndroid Build Coastguard Worker sshr v7.8h, v7.8h, #3 
744*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v31.8h 745*c0909341SAndroid Build Coastguard Worker add v7.8h, v7.8h, v31.8h 746*c0909341SAndroid Build Coastguard Worker 747*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 748*c0909341SAndroid Build Coastguard Worker 749*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x14], #32 750*c0909341SAndroid Build Coastguard Worker 751*c0909341SAndroid Build Coastguard Worker b.le 0f 752*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 753*c0909341SAndroid Build Coastguard Worker ld1 {v4.16b}, [x3], #16 754*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 755*c0909341SAndroid Build Coastguard Worker uxtl v3.8h, v4.8b 756*c0909341SAndroid Build Coastguard Worker uxtl2 v4.8h, v4.16b 757*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to pad, just keep filtering. 758*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 759*c0909341SAndroid Build Coastguard Worker 760*c0909341SAndroid Build Coastguard Worker0: 761*c0909341SAndroid Build Coastguard Worker ldr x14, [sp, #16] 762*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp], #32 763*c0909341SAndroid Build Coastguard Worker ret 764*c0909341SAndroid Build Coastguard Workerendfunc 765*c0909341SAndroid Build Coastguard Worker 766*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_v_8bpc_neon 767*c0909341SAndroid Build Coastguard Worker stp x11, x12, [sp, #-48]! 
768*c0909341SAndroid Build Coastguard Worker stp x13, x14, [sp, #16] 769*c0909341SAndroid Build Coastguard Worker stp x0, x4, [sp, #32] 770*c0909341SAndroid Build Coastguard Worker1: 771*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x12], #32 772*c0909341SAndroid Build Coastguard Worker ld1 {v22.8h, v23.8h}, [x14], #32 773*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x11], #32 774*c0909341SAndroid Build Coastguard Worker 775*c0909341SAndroid Build Coastguard Worker add v24.8h, v22.8h, v18.8h 776*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x13], #32 777*c0909341SAndroid Build Coastguard Worker add v16.8h, v22.8h, v16.8h 778*c0909341SAndroid Build Coastguard Worker add v25.8h, v23.8h, v19.8h 779*c0909341SAndroid Build Coastguard Worker 780*c0909341SAndroid Build Coastguard Worker smull v2.4s, v20.4h, v1.h[3] 781*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v24.4h, v1.h[4] 782*c0909341SAndroid Build Coastguard Worker smlal v2.4s, v16.4h, v1.h[5] 783*c0909341SAndroid Build Coastguard Worker add v17.8h, v23.8h, v17.8h 784*c0909341SAndroid Build Coastguard Worker smull2 v3.4s, v20.8h, v1.h[3] 785*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v24.8h, v1.h[4] 786*c0909341SAndroid Build Coastguard Worker smlal2 v3.4s, v16.8h, v1.h[5] 787*c0909341SAndroid Build Coastguard Worker smull v4.4s, v21.4h, v1.h[3] 788*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v25.4h, v1.h[4] 789*c0909341SAndroid Build Coastguard Worker smlal v4.4s, v17.4h, v1.h[5] 790*c0909341SAndroid Build Coastguard Worker smull2 v5.4s, v21.8h, v1.h[3] 791*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v25.8h, v1.h[4] 792*c0909341SAndroid Build Coastguard Worker smlal2 v5.4s, v17.8h, v1.h[5] 793*c0909341SAndroid Build Coastguard Worker sqrshrun v2.4h, v2.4s, #11 794*c0909341SAndroid Build Coastguard Worker sqrshrun2 v2.8h, v3.4s, #11 795*c0909341SAndroid Build Coastguard Worker sqrshrun v3.4h, v4.4s, #11 
796*c0909341SAndroid Build Coastguard Worker sqrshrun2 v3.8h, v5.4s, #11 797*c0909341SAndroid Build Coastguard Worker sqxtun v2.8b, v2.8h 798*c0909341SAndroid Build Coastguard Worker sqxtun2 v2.16b, v3.8h 799*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 800*c0909341SAndroid Build Coastguard Worker st1 {v2.16b}, [x0], #16 801*c0909341SAndroid Build Coastguard Worker b.gt 1b 802*c0909341SAndroid Build Coastguard Worker 803*c0909341SAndroid Build Coastguard Worker ldp x0, x4, [sp, #32] 804*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #16] 805*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp], #48 806*c0909341SAndroid Build Coastguard Worker 807*c0909341SAndroid Build Coastguard Worker ret 808*c0909341SAndroid Build Coastguard Workerendfunc 809*c0909341SAndroid Build Coastguard Worker 810*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_hv_8bpc_neon 811*c0909341SAndroid Build Coastguard Worker // Backing up/restoring registers shifted, so that x11 gets the value 812*c0909341SAndroid Build Coastguard Worker // of x12, etc, and x15==x11, afterwards. 813*c0909341SAndroid Build Coastguard Worker stp x12, x13, [sp, #-64]! 
814*c0909341SAndroid Build Coastguard Worker stp x14, x15, [sp, #16] 815*c0909341SAndroid Build Coastguard Worker stp x12, x0, [sp, #32] 816*c0909341SAndroid Build Coastguard Worker stp x3, x4, [sp, #48] 817*c0909341SAndroid Build Coastguard Worker 818*c0909341SAndroid Build Coastguard Worker // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL 819*c0909341SAndroid Build Coastguard Worker tst w7, #1 // LR_HAVE_LEFT 820*c0909341SAndroid Build Coastguard Worker b.eq 1f 821*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT 822*c0909341SAndroid Build Coastguard Worker cbnz x2, 0f 823*c0909341SAndroid Build Coastguard Worker // left == NULL 824*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 825*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 826*c0909341SAndroid Build Coastguard Worker b 2f 827*c0909341SAndroid Build Coastguard Worker 828*c0909341SAndroid Build Coastguard Worker0: 829*c0909341SAndroid Build Coastguard Worker // LR_HAVE_LEFT, left != NULL 830*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 831*c0909341SAndroid Build Coastguard Worker ld1 {v2.s}[3], [x2], #4 832*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 bytes we loaded earlier, 833*c0909341SAndroid Build Coastguard Worker // which we'll shift out. 834*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 835*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #14 836*c0909341SAndroid Build Coastguard Worker b 2f 837*c0909341SAndroid Build Coastguard Worker1: 838*c0909341SAndroid Build Coastguard Worker ld1 {v3.16b}, [x3], #16 839*c0909341SAndroid Build Coastguard Worker // !LR_HAVE_LEFT, fill v2 with the leftmost byte 840*c0909341SAndroid Build Coastguard Worker // and shift v3 to have 2x the first byte at the front. 
841*c0909341SAndroid Build Coastguard Worker dup v2.16b, v3.b[0] 842*c0909341SAndroid Build Coastguard Worker // Move x3 back to account for the last 2 bytes we loaded before, 843*c0909341SAndroid Build Coastguard Worker // which we shifted out. 844*c0909341SAndroid Build Coastguard Worker sub x3, x3, #2 845*c0909341SAndroid Build Coastguard Worker ext v3.16b, v2.16b, v3.16b, #14 846*c0909341SAndroid Build Coastguard Worker 847*c0909341SAndroid Build Coastguard Worker2: 848*c0909341SAndroid Build Coastguard Worker ld1 {v4.8b}, [x3], #8 849*c0909341SAndroid Build Coastguard Worker uxtl v2.8h, v3.8b 850*c0909341SAndroid Build Coastguard Worker uxtl2 v3.8h, v3.16b 851*c0909341SAndroid Build Coastguard Worker uxtl v4.8h, v4.8b 852*c0909341SAndroid Build Coastguard Worker 853*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 854*c0909341SAndroid Build Coastguard Worker b.ne 4f 855*c0909341SAndroid Build Coastguard Worker 856*c0909341SAndroid Build Coastguard Worker3: // !LR_HAVE_RIGHT 857*c0909341SAndroid Build Coastguard Worker 858*c0909341SAndroid Build Coastguard Worker // Check whether we need to pad the right edge 859*c0909341SAndroid Build Coastguard Worker cmp w4, #18 860*c0909341SAndroid Build Coastguard Worker b.ge 4f // If w >= 18, all used input pixels are valid 861*c0909341SAndroid Build Coastguard Worker 862*c0909341SAndroid Build Coastguard Worker // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, 863*c0909341SAndroid Build Coastguard Worker // this ends up called again; it's not strictly needed in those 864*c0909341SAndroid Build Coastguard Worker // cases (we pad enough here), but keeping the code as simple as possible. 865*c0909341SAndroid Build Coastguard Worker 866*c0909341SAndroid Build Coastguard Worker // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie 867*c0909341SAndroid Build Coastguard Worker // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 
868*c0909341SAndroid Build Coastguard Worker sub w17, w4, #23 869*c0909341SAndroid Build Coastguard Worker // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the 870*c0909341SAndroid Build Coastguard Worker // buffer pointer. 871*c0909341SAndroid Build Coastguard Worker movrel x6, right_ext_mask, -4 872*c0909341SAndroid Build Coastguard Worker ldr b28, [x3, w17, sxtw] 873*c0909341SAndroid Build Coastguard Worker sub x6, x6, w4, uxtw #1 874*c0909341SAndroid Build Coastguard Worker dup v28.8h, v28.h[0] 875*c0909341SAndroid Build Coastguard Worker ld1 {v25.16b, v26.16b, v27.16b}, [x6] 876*c0909341SAndroid Build Coastguard Worker 877*c0909341SAndroid Build Coastguard Worker bit v2.16b, v28.16b, v25.16b 878*c0909341SAndroid Build Coastguard Worker bit v3.16b, v28.16b, v26.16b 879*c0909341SAndroid Build Coastguard Worker bit v4.16b, v28.16b, v27.16b 880*c0909341SAndroid Build Coastguard Worker 881*c0909341SAndroid Build Coastguard Worker4: // Loop horizontally 882*c0909341SAndroid Build Coastguard Worker 883*c0909341SAndroid Build Coastguard Worker ext v16.16b, v2.16b, v3.16b, #2 884*c0909341SAndroid Build Coastguard Worker ext v18.16b, v2.16b, v3.16b, #6 885*c0909341SAndroid Build Coastguard Worker ext v19.16b, v2.16b, v3.16b, #8 886*c0909341SAndroid Build Coastguard Worker ext v17.16b, v2.16b, v3.16b, #4 887*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 888*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v2.8h 889*c0909341SAndroid Build Coastguard Worker shl v22.8h, v17.8h, #7 890*c0909341SAndroid Build Coastguard Worker mul v6.8h, v17.8h, v0.h[3] 891*c0909341SAndroid Build Coastguard Worker mla v6.8h, v18.8h, v0.h[4] 892*c0909341SAndroid Build Coastguard Worker mla v6.8h, v19.8h, v0.h[5] 893*c0909341SAndroid Build Coastguard Worker 894*c0909341SAndroid Build Coastguard Worker ext v16.16b, v3.16b, v4.16b, #2 895*c0909341SAndroid Build Coastguard Worker ext v18.16b, v3.16b, v4.16b, #6 896*c0909341SAndroid Build 
Coastguard Worker ext v19.16b, v3.16b, v4.16b, #8 897*c0909341SAndroid Build Coastguard Worker ext v17.16b, v3.16b, v4.16b, #4 898*c0909341SAndroid Build Coastguard Worker add v18.8h, v18.8h, v16.8h 899*c0909341SAndroid Build Coastguard Worker add v19.8h, v19.8h, v3.8h 900*c0909341SAndroid Build Coastguard Worker shl v23.8h, v17.8h, #7 901*c0909341SAndroid Build Coastguard Worker mul v7.8h, v17.8h, v0.h[3] 902*c0909341SAndroid Build Coastguard Worker mla v7.8h, v18.8h, v0.h[4] 903*c0909341SAndroid Build Coastguard Worker mla v7.8h, v19.8h, v0.h[5] 904*c0909341SAndroid Build Coastguard Worker 905*c0909341SAndroid Build Coastguard Worker ld1 {v18.8h, v19.8h}, [x12], #32 906*c0909341SAndroid Build Coastguard Worker 907*c0909341SAndroid Build Coastguard Worker sub v22.8h, v22.8h, v30.8h 908*c0909341SAndroid Build Coastguard Worker sub v23.8h, v23.8h, v30.8h 909*c0909341SAndroid Build Coastguard Worker ld1 {v24.8h, v25.8h}, [x14], #32 910*c0909341SAndroid Build Coastguard Worker sqadd v6.8h, v6.8h, v22.8h 911*c0909341SAndroid Build Coastguard Worker sqadd v7.8h, v7.8h, v23.8h 912*c0909341SAndroid Build Coastguard Worker ld1 {v16.8h, v17.8h}, [x11], #32 913*c0909341SAndroid Build Coastguard Worker sshr v6.8h, v6.8h, #3 914*c0909341SAndroid Build Coastguard Worker sshr v7.8h, v7.8h, #3 915*c0909341SAndroid Build Coastguard Worker ld1 {v20.8h, v21.8h}, [x13], #32 916*c0909341SAndroid Build Coastguard Worker add v6.8h, v6.8h, v31.8h 917*c0909341SAndroid Build Coastguard Worker add v7.8h, v7.8h, v31.8h 918*c0909341SAndroid Build Coastguard Worker 919*c0909341SAndroid Build Coastguard Worker add v24.8h, v24.8h, v18.8h 920*c0909341SAndroid Build Coastguard Worker add v16.8h, v16.8h, v6.8h 921*c0909341SAndroid Build Coastguard Worker 922*c0909341SAndroid Build Coastguard Worker smull v18.4s, v20.4h, v1.h[3] 923*c0909341SAndroid Build Coastguard Worker smlal v18.4s, v24.4h, v1.h[4] 924*c0909341SAndroid Build Coastguard Worker smlal v18.4s, v16.4h, v1.h[5] 925*c0909341SAndroid 
Build Coastguard Worker add v25.8h, v25.8h, v19.8h 926*c0909341SAndroid Build Coastguard Worker smull2 v19.4s, v20.8h, v1.h[3] 927*c0909341SAndroid Build Coastguard Worker smlal2 v19.4s, v24.8h, v1.h[4] 928*c0909341SAndroid Build Coastguard Worker smlal2 v19.4s, v16.8h, v1.h[5] 929*c0909341SAndroid Build Coastguard Worker add v17.8h, v17.8h, v7.8h 930*c0909341SAndroid Build Coastguard Worker smull v20.4s, v21.4h, v1.h[3] 931*c0909341SAndroid Build Coastguard Worker smlal v20.4s, v25.4h, v1.h[4] 932*c0909341SAndroid Build Coastguard Worker smlal v20.4s, v17.4h, v1.h[5] 933*c0909341SAndroid Build Coastguard Worker smull2 v21.4s, v21.8h, v1.h[3] 934*c0909341SAndroid Build Coastguard Worker smlal2 v21.4s, v25.8h, v1.h[4] 935*c0909341SAndroid Build Coastguard Worker smlal2 v21.4s, v17.8h, v1.h[5] 936*c0909341SAndroid Build Coastguard Worker sqrshrun v18.4h, v18.4s, #11 937*c0909341SAndroid Build Coastguard Worker sqrshrun2 v18.8h, v19.4s, #11 938*c0909341SAndroid Build Coastguard Worker sqrshrun v19.4h, v20.4s, #11 939*c0909341SAndroid Build Coastguard Worker sqrshrun2 v19.8h, v21.4s, #11 940*c0909341SAndroid Build Coastguard Worker st1 {v6.8h, v7.8h}, [x15], #32 941*c0909341SAndroid Build Coastguard Worker sqxtun v18.8b, v18.8h 942*c0909341SAndroid Build Coastguard Worker sqxtun2 v18.16b, v19.8h 943*c0909341SAndroid Build Coastguard Worker subs w4, w4, #16 944*c0909341SAndroid Build Coastguard Worker 945*c0909341SAndroid Build Coastguard Worker st1 {v18.16b}, [x0], #16 946*c0909341SAndroid Build Coastguard Worker 947*c0909341SAndroid Build Coastguard Worker b.le 0f 948*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 949*c0909341SAndroid Build Coastguard Worker ld1 {v4.16b}, [x3], #16 950*c0909341SAndroid Build Coastguard Worker tst w7, #2 // LR_HAVE_RIGHT 951*c0909341SAndroid Build Coastguard Worker uxtl v3.8h, v4.8b 952*c0909341SAndroid Build Coastguard Worker uxtl2 v4.8h, v4.16b 953*c0909341SAndroid Build Coastguard Worker b.ne 4b // If we don't need to 
pad, just keep filtering. 954*c0909341SAndroid Build Coastguard Worker b 3b // If we need to pad, check how many pixels we have left. 955*c0909341SAndroid Build Coastguard Worker 956*c0909341SAndroid Build Coastguard Worker0: 957*c0909341SAndroid Build Coastguard Worker ldp x3, x4, [sp, #48] 958*c0909341SAndroid Build Coastguard Worker ldp x15, x0, [sp, #32] 959*c0909341SAndroid Build Coastguard Worker ldp x13, x14, [sp, #16] 960*c0909341SAndroid Build Coastguard Worker ldp x11, x12, [sp], #64 961*c0909341SAndroid Build Coastguard Worker 962*c0909341SAndroid Build Coastguard Worker add x3, x3, x1 963*c0909341SAndroid Build Coastguard Worker add x0, x0, x1 964*c0909341SAndroid Build Coastguard Worker 965*c0909341SAndroid Build Coastguard Worker ret 966*c0909341SAndroid Build Coastguard Workerendfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
//
// Horizontal 3-tap box sums (of pixels and of squared pixels) for one row.
//
// In:    x0 = sumsq   32-bit outputs, stored 8 per loop iteration
//        x1 = sum     16-bit outputs, stored 8 per loop iteration
//        x2 = left    may be NULL; only the last 2 bytes of the 4-byte
//                     entry are used, and only when LR_HAVE_LEFT is set
//        x3 = src
//        w4 = w       the loop covers w + 2 outputs
//        w5 = edges   bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT
//
// With in[] denoting the row starting 2 pixels to the left of src,
// output i is in[i] + in[i+1] + in[i+2], and likewise for the squares.
// Outputs are always stored in groups of 8, so up to 7 elements past
// w + 2 are written to both output buffers.
//
// Clobbers: x0, x1, x3, w4, x13, v0-v3, v16, v17, v20, v21, v26, v27,
//           v29, v30, flags.
function sgr_box3_row_h_8bpc_neon, export=1
        add             w4, w4, #2 // w += 2

        tst             w5, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3, x3, #2
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x3], #16
        ld1             {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #2
        ext             v0.16b, v1.16b, v0.16b, #14
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #2
        ext             v0.16b, v1.16b, v0.16b, #14

2:
        // On all three paths x3 now points at src + 14.
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w5, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        // (w4 - 17) + (src + 14) == src + w - 1, with w the original width.
        sub             w13, w4, #(2 + 16 - 2 + 1)
        ldr             b30, [x3, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #10
        b.ge            4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0.b[w] onwards
        // Loading 16 bytes from (right_ext_mask - w4) gives w4 zero bytes
        // followed by 0xff bytes.
        movrel          x13, right_ext_mask
        sub             x13, x13, w4, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        uaddl           v3.8h, v0.8b, v16.8b
        ext             v20.16b, v1.16b, v2.16b, #2
        uaddw           v3.8h, v3.8h, v17.8b

        ext             v21.16b, v1.16b, v2.16b, #4

        uaddl           v26.4s, v1.4h, v20.4h
        uaddl2          v27.4s, v1.8h, v20.8h
        uaddw           v26.4s, v26.4s, v21.4h
        uaddw2          v27.4s, v27.4s, v21.8h

        subs            w4, w4, #8

        st1             {v3.8h}, [x1], #16
        st1             {v26.4s,v27.4s}, [x0], #32

        b.le            9f
        // The flags set by this tst are consumed by the b.ne below; the
        // instructions in between do not modify them.
        tst             w5, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x3], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
//
// Horizontal 5-tap box sums (of pixels and of squared pixels) for one row.
//
// In:    x0 = sumsq   32-bit outputs, stored 8 per loop iteration
//        x1 = sum     16-bit outputs, stored 8 per loop iteration
//        x2 = left    may be NULL; only the last 3 bytes of the 4-byte
//                     entry are used, and only when LR_HAVE_LEFT is set
//        x3 = src
//        w4 = w       the loop covers w + 2 outputs
//        w5 = edges   bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT
//
// With in[] denoting the row starting 3 pixels to the left of src,
// output i is in[i] + ... + in[i+4], and likewise for the squares.
// Outputs are always stored in groups of 8, so up to 7 elements past
// w + 2 are written to both output buffers.
//
// Clobbers: x0, x1, x2, x3, w4, x13, v0-v3, v16-v19, v24, v26, v27,
//           v29, v30, flags.
function sgr_box5_row_h_8bpc_neon, export=1
        add             w4, w4, #2 // w += 2

        tst             w5, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3, x3, #3
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x3], #16
        ld1             {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        // On all three paths x3 now points at src + 13.
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w5, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        // (w4 - 16) + (src + 13) == src + w - 1, with w the original width.
        sub             w13, w4, #(2 + 16 - 3 + 1)
        ldr             b30, [x3, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #11
        b.ge            4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w4, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        ext             v18.16b, v0.16b, v0.16b, #3
        ext             v19.16b, v0.16b, v0.16b, #4
        uaddl           v3.8h, v0.8b, v16.8b
        uaddl           v24.8h, v17.8b, v18.8b
        uaddw           v3.8h, v3.8h, v19.8b
        add             v3.8h, v3.8h, v24.8h

        ext             v16.16b, v1.16b, v2.16b, #2
        ext             v17.16b, v1.16b, v2.16b, #4
        ext             v18.16b, v1.16b, v2.16b, #6
        ext             v19.16b, v1.16b, v2.16b, #8

        uaddl           v26.4s, v1.4h, v16.4h
        uaddl2          v27.4s, v1.8h, v16.8h
        uaddl           v16.4s, v17.4h, v18.4h
        uaddl2          v17.4s, v17.8h, v18.8h
        uaddw           v26.4s, v26.4s, v19.4h
        uaddw2          v27.4s, v27.4s, v19.8h
        add             v26.4s, v26.4s, v16.4s
        add             v27.4s, v27.4s, v17.4s

        subs            w4, w4, #8

        st1             {v3.8h}, [x1], #16
        st1             {v26.4s,v27.4s}, [x0], #32

        b.le            9f
        // The flags set by this tst are consumed by the b.ne below; the
        // instructions in between do not modify them.
        tst             w5, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x3], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
//
// Combined horizontal 3-tap and 5-tap box sums for one row; the 5-tap
// result is formed by adding the two outer taps to the 3-tap result.
//
// In:    x0 = sumsq3  32-bit outputs, stored 8 per loop iteration
//        x1 = sum3    16-bit outputs, stored 8 per loop iteration
//        x2 = sumsq5  32-bit outputs, stored 8 per loop iteration
//        x3 = sum5    16-bit outputs, stored 8 per loop iteration
//        x4 = left    may be NULL; only the last 3 bytes of the 4-byte
//                     entry are used, and only when LR_HAVE_LEFT is set
//        x5 = src
//        w6 = w       the loop covers w + 2 outputs
//        w7 = edges   bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT
//
// With in[] denoting the row starting 3 pixels to the left of src,
// 3-tap output i is in[i+1] + in[i+2] + in[i+3] and 5-tap output i is
// in[i] + ... + in[i+4]; likewise for the squares.
// Outputs are always stored in groups of 8, so up to 7 elements past
// w + 2 are written to all four output buffers.
//
// Clobbers: x0-x5, w6, x13, v0-v3, v16-v19, v24, v26, v27, v29, v30,
//           flags.
function sgr_box35_row_h_8bpc_neon, export=1
        add             w6, w6, #2 // w += 2

        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x4, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x5, x5, #3
        ld1             {v0.16b}, [x5], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x5], #16
        ld1             {v1.s}[3], [x4], #4
        // Move x5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x5, x5, #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x5, x5, #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        // On all three paths x5 now points at src + 13.
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        // (w6 - 16) + (src + 13) == src + w - 1, with w the original width.
        sub             w13, w6, #(2 + 16 - 3 + 1)
        ldr             b30, [x5, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w6, #11
        b.ge            4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w6, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        ext             v19.16b, v0.16b, v0.16b, #4
        ext             v18.16b, v0.16b, v0.16b, #3
        uaddl           v3.8h, v16.8b, v17.8b
        uaddl           v24.8h, v0.8b, v19.8b
        uaddw           v3.8h, v3.8h, v18.8b

        ext             v16.16b, v1.16b, v2.16b, #2
        ext             v17.16b, v1.16b, v2.16b, #4
        ext             v19.16b, v1.16b, v2.16b, #8
        ext             v18.16b, v1.16b, v2.16b, #6

        // Store the 3-tap sum, then extend it to the 5-tap sum by adding
        // the two outer taps held in v24.
        st1             {v3.8h}, [x1], #16
        add             v3.8h, v3.8h, v24.8h

        uaddl           v26.4s, v16.4h, v17.4h
        uaddl2          v27.4s, v16.8h, v17.8h
        uaddl           v16.4s, v1.4h, v19.4h
        uaddl2          v17.4s, v1.8h, v19.8h
        uaddw           v26.4s, v26.4s, v18.4h
        uaddw2          v27.4s, v27.4s, v18.8h

        // Same for the squares: store the 3-tap sum, then add the outer taps.
        st1             {v26.4s,v27.4s}, [x0], #32
        add             v26.4s, v26.4s, v16.4s
        add             v27.4s, v27.4s, v17.4s

        subs            w6, w6, #8

        st1             {v3.8h}, [x3], #16
        st1             {v26.4s,v27.4s}, [x2], #32

        b.le            9f
        // The flags set by this tst are consumed by the b.ne below; the
        // instructions in between do not modify them.
        tst             w7, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x5], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

sgr_funcs 8