/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Mask table used for right-edge padding: 32 bytes of 0x00 followed by
// 32 bytes of 0xff. A 32-byte load starting somewhere before right_ext_mask
// yields a mask that is zero for the leading (valid) bytes and all-ones for
// the trailing bytes that are to be replaced by the padding pixel.
const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                       const pixel *src, ptrdiff_t stride,
//                                       const int16_t fh[7], const intptr_t w,
//                                       int h, enum LrEdgeFlags edges,
//                                       const int bitdepth_max);
//
// Horizontal pass of the 7-tap Wiener filter for 16 bpc input.
//
// Register use:
//   r0 = dst        r1 = left       r2 = src        r3 = stride
//   r4 = fh         r5 = w          r6 = h          r7 = edges
//   r8 = bitdepth_max on entry, later a saved copy of w
// r4 is reused as a scratch pointer for the padding mask once the
// coefficients have been loaded into q0.
//
// Two rows are processed per vertical iteration (read via r2/lr, written
// via r0/r12) and eight output pixels per row per horizontal iteration.
// Only fh[3]..fh[6] are read; the filter is evaluated in symmetric form by
// adding the mirrored input pixels before multiplying.
function wiener_filter_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        // 9 core registers (36 bytes) + q4-q7 (64 bytes) were pushed, so the
        // stack-passed arguments start at sp + 100.
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        ldr             r8,       [sp, #116] // bitdepth_max
        // Loads 8 halfwords with 128-bit alignment although fh is declared
        // as [7]; presumably the caller provides a padded, aligned array -
        // TODO confirm.
        vld1.16         {q0},  [r4, :128]
        clz             r8,  r8
        vmov.i32        q14, #1
        sub             r9,  r8,  #38  // -(bitdepth + 6)
        sub             r8,  r8,  #25  // -round_bits_h
        neg             r9,  r9        // bitdepth + 6
        vdup.32         q1,  r9
        vdup.32         q13, r8        // -round_bits_h
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
        mov             r8,  r5
        // Calculate mid_stride
        add             r10, r5,  #7
        bic             r10, r10, #7
        lsl             r10, r10, #1

        // Set up pointers for reading/writing alternate rows
        add             r12, r0,  r10
        lsl             r10, r10, #1
        add             lr,  r2,  r3
        lsl             r3,  r3,  #1

        // Subtract the aligned width from mid_stride
        add             r11, r5,  #7
        bic             r11, r11, #7
        sub             r10, r10, r11, lsl #1

        // Subtract the number of pixels read from the source stride
        add             r11, r11, #8
        sub             r3,  r3,  r11, lsl #1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r3,  r3,  #6


1:      // Loop vertically
        vld1.16         {q2, q3}, [r2]!
        vld1.16         {q4, q5}, [lr]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r1,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d3},  [r1]!
        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vld1.16         {d13}, [r1]!
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q1,  d4[0]
        vdup.16         q6,  d8[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10

2:

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             r9,  r5,  #14
        lsl             r9,  r9,  #1
        ldrh            r11, [r2, r9]
        ldrh            r9,  [lr, r9]
        // Fill q11/q12 with the right padding pixel
        vdup.16         q11, r11
        vdup.16         q12, r9
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r4,  right_ext_mask, -6
        sub             r4,  r4,  r5,  lsl #1
        vld1.8          {q9, q10}, [r4]

        vbit            q2,  q11, q9
        vbit            q3,  q11, q10
        vbit            q4,  q12, q9
        vbit            q5,  q12, q10

4:      // Loop horizontally
        // First row: build the shifted copies of q2:q3 and add the mirrored
        // pairs, then multiply by fh[3]..fh[6].
        vext.8          q7,  q2,  q3,  #4
        vext.8          q8,  q2,  q3,  #8
        vext.8          q6,  q2,  q3,  #2
        vext.8          q9,  q2,  q3,  #10
        vadd.i16        q8,  q8,  q7
        vadd.i16        q9,  q9,  q6
        vext.8          q6,  q2,  q3,  #12
        vext.8          q7,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q6
        vmull.s16       q6,  d14, d0[3]
        vmlal.s16       q6,  d16, d1[0]
        vmlal.s16       q6,  d18, d1[1]
        vmlal.s16       q6,  d4,  d1[2]
        vmull.s16       q7,  d15, d0[3]
        vmlal.s16       q7,  d17, d1[0]
        vmlal.s16       q7,  d19, d1[1]
        vmlal.s16       q7,  d5,  d1[2]

        // Second row, same scheme on q4:q5 (q2 is free to use as scratch
        // here; it is reloaded from q3 before the next iteration).
        vext.8          q8,  q4,  q5,  #4
        vext.8          q10, q4,  q5,  #8
        vext.8          q9,  q4,  q5,  #2
        vext.8          q2,  q4,  q5,  #10
        vadd.i16        q10, q10, q8
        vadd.i16        q2,  q2,  q9
        vext.8          q8,  q4,  q5,  #12
        vext.8          q9,  q4,  q5,  #6
        vadd.i16        q4,  q4,  q8
        vmull.s16       q8,  d18, d0[3]
        vmlal.s16       q8,  d20, d1[0]
        vmlal.s16       q8,  d4,  d1[1]
        vmlal.s16       q8,  d8,  d1[2]
        vmull.s16       q9,  d19, d0[3]
        vmlal.s16       q9,  d21, d1[0]
        vmlal.s16       q9,  d5,  d1[1]
        vmlal.s16       q9,  d9,  d1[2]

        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
        vadd.i32        q6,  q6,  q14
        vadd.i32        q7,  q7,  q14
        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q14
        vrshl.s32       q6,  q6,  q13
        vrshl.s32       q7,  q7,  q13
        vrshl.s32       q8,  q8,  q13
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d12, q6
        vqmovun.s32     d13, q7
        vqmovun.s32     d14, q8
        vqmovun.s32     d15, q9
        vmin.u16        q6,  q6,  q10
        vmin.u16        q7,  q7,  q10
        vsub.i16        q6,  q6,  q15
        vsub.i16        q7,  q7,  q15
        subs            r5,  r5,  #8
        vst1.16         {q6}, [r0,  :128]!
        vst1.16         {q7}, [r12, :128]!

        // The flags from the subs above are still live here; the NEON
        // stores do not modify them.
        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q2,  q3
        vmov            q4,  q5
        vld1.16         {q3}, [r2]!
        vld1.16         {q5}, [lr]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r10
        add             r12, r12, r10
        add             r2,  r2,  r3
        add             lr,  lr,  r3
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
//                                       const int16_t *mid, int w, int h,
//                                       const int16_t fv[7], enum LrEdgeFlags edges,
//                                       ptrdiff_t mid_stride, const int bitdepth_max);
//
// Vertical pass of the 7-tap Wiener filter for 16 bpc output.
//
// Register use:
//   r0 = dst        r1 = stride     r2 = mid        r3 = w
//   r4 = h (counted down per slice)                 r5 = fv
//   r6 = edges      r7 = mid_stride
//   lr = bitdepth_max on entry, later a saved copy of h
//   r12 = number of mid rows to step back after each slice
//
// The image is processed in vertical slices of 8 pixels. Within a slice
// the seven input rows live in q8-q14; q15 and q1 hold the following rows
// while the bottom edge is being drained.
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q5}
        // 5 core registers (20 bytes) + q4-q5 (32 bytes) were pushed, so the
        // stack-passed arguments start at sp + 52.
        ldrd            r4,  r5,  [sp, #52]
        ldrd            r6,  r7,  [sp, #60]
        ldr             lr,       [sp, #68] // bitdepth_max
        vld1.16         {q0},  [r5, :128]
        vdup.16         q5,  lr
        clz             lr,  lr
        sub             lr,  lr,  #11   // round_bits_v
        vdup.32         q4,  lr
        mov             lr,  r4
        vneg.s32        q4,  q4         // -round_bits_v

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6,  #4 // LR_HAVE_TOP
        beq             0f
        sub             r2,  r2,  r7,  lsl #1
        add             r12, r12, #2
0:
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6,  #4 // LR_HAVE_TOP
        vld1.16         {q8},  [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9,  q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4,  #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
// Filter one output row from q8-q14, store it, and shift the row window
// down by one (q8 <- q9 ... q13 <- q14). With compare=1 the macro ends with
// the flags of "cmp r4, #4"; with compare=0 it branches to 9f once r4 has
// reached zero.
.macro filter compare
        subs            r4,  r4,  #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        vmull.s16       q2,  d16, d0[0]
        vmlal.s16       q2,  d18, d0[1]
        vmlal.s16       q2,  d20, d0[2]
        vmlal.s16       q2,  d22, d0[3]
        vmlal.s16       q2,  d24, d1[0]
        vmlal.s16       q2,  d26, d1[1]
        vmlal.s16       q2,  d28, d1[2]
        vmull.s16       q3,  d17, d0[0]
        vmlal.s16       q3,  d19, d0[1]
        vmlal.s16       q3,  d21, d0[2]
        vmlal.s16       q3,  d23, d0[3]
        vmlal.s16       q3,  d25, d1[0]
        vmlal.s16       q3,  d27, d1[1]
        vmlal.s16       q3,  d29, d1[2]
        vrshl.s32       q2,  q2,  q4    // round_bits_v
        vrshl.s32       q3,  q3,  q4
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q5    // bitdepth_max
        vst1.16         {q2}, [r0, :128], r1
.if \compare
        cmp             r4,  #4
.else
        ble             9f
.endif
        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4,  #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        // q12 and q13 hold the two bottom edge rows loaded above; the third
        // row below the content is padding, i.e. a copy of the last edge
        // row. q14 must be written here since the filter reads d28/d29.
        vmov            q14, q13
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4,  #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13

8:      // At this point, all registers up to q14-q15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3,  r3,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0,  r1,  lr,  r0
        mls             r2,  r7,  r12, r2
        add             r0,  r0,  #16
        add             r2,  r2,  #16
        mov             r4,  lr
        b               1b

0:
        vpop            {q4-q5}
        pop             {r4-r7,pc}
.purgem filter
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
// Horizontal 3-pixel box sums, 16 bpc. Two source rows are processed per
// vertical iteration. For every output position the three horizontally
// adjacent pixels are summed into a 16 bit value (written to sum) and their
// squares are summed into a 32 bit value (written to sumsq).
//
// Arguments: r0 = sumsq, r1 = sum, r2 = left, r3 = src; stride, w, h and
// edges are read from the stack.
//
// Register use after the prologue:
//   r0 / r10   sumsq output, first / second row of the pair
//   r1 / r11   sum output, first / second row of the pair
//   r2         left edge pixels (4 pixels per row), may be NULL
//   r3 / r12   src input, first / second row of the pair
//   r4         input pointer increment applied at the end of a row pair
//   r5         remaining width in the current row (starts at w + 2)
//   r6         remaining height, decremented by 2 per row pair
//   r7         edges (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT)
//   r8         saved copy of w + 2, used to reload r5 for each row pair
//   r9         output pointer increment applied at the end of a row pair
//   q14 / q15  right padding pixel for the first / second row
//
// NOTE(review): rows are always handled in pairs (r6 is decremented by 2),
// so for an odd h the second row of the last pair is still read and written;
// presumably the caller's buffers allow for that - confirm against the caller.
function sgr_box3_h_16bpc_neon, export=1
        // 9 core registers (36 bytes) plus q4-q7 (64 bytes) are pushed, so
        // the first stack argument ends up at [sp, #100].
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]        // r4 = stride, r5 = w
        ldrd            r6,  r7,  [sp, #108]        // r6 = h, r7 = edges
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1                // two input rows per iteration
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        // lr = (w + 2) rounded up to a multiple of 8, which is the number of
        // elements stored per row by the horizontal loop below.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Subtract the number of pixels read from the input from the stride.
        // Each row loads 16 pixels up front and 8 more after every horizontal
        // iteration except the last, i.e. aligned width + 8 pixels in total.
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL: the 2 left pixels are read directly from before src.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #4


1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        // d5 is the upper half of q2: 4 left pixels for the first row.
        vld1.16         {d5},     [r2]!
        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        // d13 is the upper half of q6: 4 left pixels for the second row.
        vld1.16         {d13},    [r2]!
        // Shift the rows right by 2 pixels, inserting the last 2 pixels of
        // q2/q6 at the front.
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2/q6 with the leftmost pixel of each row
        // and shift q0/q4 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3/r12 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        // r3/r12 point 16 pixels past the start of q0/q4, and q0/q4 start
        // 2 pixels left of src, so the last valid pixel is at element offset
        // (w + 2) - (2 + 16 - 2 + 1) from r3/r12.
        sub             lr,  r5,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w] onwards.
        // right_ext_mask is preceded by 32 zero bytes and followed by 0xff
        // bytes, so loading from 2*w bytes before it yields a mask that is
        // clear for the first w halfwords and set for the rest.
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13

4:      // Loop horizontally
        // q8/q9 (q10/q11 for the second row) are the row shifted left by
        // 1 and 2 pixels.
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        // q2/q3 = sum of 3 adjacent pixels, first/second row.
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        // q6:q7 / q12:q13 = sum of 3 adjacent squared pixels, first/second row.
        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23
        // The flags set here are consumed by the "ble 9f" after the stores.
        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6,  q7},  [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!

        ble             9f
        // The flags set here are consumed by the "bne 4b" after the loads.
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next pair of rows and loop horizontally.
        // r9 is in bytes of 16 bit sum elements; sumsq elements are 32 bit,
        // hence the extra shift for r0/r10.
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// Horizontal 5-pixel box sums, 16 bpc. Same structure and register use as
// sgr_box3_h_16bpc_neon, but the rows are extended by 3 pixels on the left
// and five horizontally adjacent pixels contribute to every output element.
//
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
function sgr_box5_h_16bpc_neon, export=1
        // 9 core registers (36 bytes) plus q4-q7 (64 bytes) are pushed, so
        // the first stack argument ends up at [sp, #100].
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]        // r4 = stride, r5 = w
        ldrd            r6,  r7,  [sp, #108]        // r6 = h, r7 = edges
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1                // two input rows per iteration
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride, and the number
        // of pixels read (aligned width + 8) from the input stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL: the 3 left pixels are read directly from before src.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #6

1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        // d5 is the upper half of q2: 4 left pixels for the first row.
        vld1.16         {d5},     [r2]!
        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        // d13 is the upper half of q6: 4 left pixels for the second row.
        vld1.16         {d13},    [r2]!
        // Shift the rows right by 3 pixels, inserting the last 3 pixels of
        // q2/q6 at the front.
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2/q6 with the leftmost pixel of each row
        // and shift q0/q4 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3/r12 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        // r3/r12 point 16 pixels past the start of q0/q4, and q0/q4 start
        // 3 pixels left of src, so the last valid pixel is at element offset
        // (w + 2) - (2 + 16 - 3 + 1) from r3/r12.
        sub             lr,  r5,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        // right_ext_mask is preceded by 32 zero bytes and followed by 0xff
        // bytes, so the loaded mask is clear for the first w+1 halfwords
        // and set for the rest.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13

4:      // Loop horizontally
        // First three taps: the row itself and the row shifted left by
        // 1 and 2 pixels.
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        // Remaining two taps: the row shifted left by 3 and 4 pixels.
        vext.8          q8,  q0,  q1,  #6
        vext.8          q10, q4,  q5,  #6
        vext.8          q9,  q0,  q1,  #8
        vext.8          q11, q4,  q5,  #8
        vadd.i16        q2,  q2,  q8
        vadd.i16        q3,  q3,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        // The low half of a row shifted by 4 pixels (8 bytes) is the high
        // half of the unshifted row, so d1/d9 are used directly for it.
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d1,  d1
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d9,  d9
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        // The flags set here are consumed by the "ble 9f" after the stores.
        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6,  q7},  [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!

        ble             9f
        // The flags set here are consumed by the "bne 4b" after the loads.
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next pair of rows and loop horizontally.
        // r9 is in bytes of 16 bit sum elements; sumsq elements are 32 bit,
        // hence the extra shift for r0/r10.
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

sgr_funcs 16