/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// AArch64 (GNU as syntax) NEON macro templates for the CDEF filter and the
// CDEF direction search. Nothing in this file invokes the top-level macros
// `tables`, `filter` and `find_dir`; presumably a per-bitdepth file includes
// this one and instantiates them - TODO confirm against the including files.
// `function`, `endfunc`, `const`, `endconst` and `movrel` are not defined
// here; they are expected to come from the included headers.

// dir_table w, stride
//
// Emits `directions\w`: a table of signed byte offsets, two per direction,
// each computed as y * \stride + x. The offsets are in units of 16 bit
// elements; load_px scales them by 2 (sxtb #1) when forming addresses.
// The first 6 directions are repeated after the 8 real ones so that the
// filter can read entries dir + 2 and dir + 6 without wrapping the index.
.macro dir_table w, stride
const directions\w
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
        .byte            1 * \stride + 0,  2 * \stride + 0
        .byte            1 * \stride + 0,  2 * \stride - 1
// Repeated, to avoid & 7
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
endconst
.endm

// tables
//
// Instantiates the constant data used by filter_func: directions8 (row
// stride of 16 elements), directions4 (row stride of 8 elements) and
// pri_taps. pri_taps holds two pairs of primary tap weights; filter_func
// selects the pair with (pri_strength >> bitdepth_min_8) & 1.
.macro tables
dir_table 8, 16
dir_table 4, 8

const pri_taps
        .byte           4, 2, 3, 3
endconst
.endm

// load_px d1, d2, w
//
// Loads the two pixel vectors located at +off and -off from the current
// position.
//   In:    x2 = pointer to the current pixel(s), w9 = signed byte offset
//               in 16 bit elements (only the low 8 bits are used)
//   Out:   \d1 = pixels at x + off (p0), \d2 = pixels at x - off (p1)
//   Clobb: x6, x9
// For \w == 4, two rows of 4 pixels (8 elements apart) are packed into one
// 8h vector.
.macro load_px d1, d2, w
.if \w == 8
        add             x6,  x2,  w9, sxtb #1       // x + off
        sub             x9,  x2,  w9, sxtb #1       // x - off
        ld1             {\d1\().8h}, [x6]           // p0
        ld1             {\d2\().8h}, [x9]           // p1
.else
        add             x6,  x2,  w9, sxtb #1       // x + off
        sub             x9,  x2,  w9, sxtb #1       // x - off
        ld1             {\d1\().4h}, [x6]           // p0
        add             x6,  x6,  #2*8              // += stride
        ld1             {\d2\().4h}, [x9]           // p1
        add             x9,  x9,  #2*8              // += stride
        ld1             {\d1\().d}[1], [x6]         // p0
        ld1             {\d2\().d}[1], [x9]         // p1
.endif
.endm

// handle_pixel s1, s2, thresh_vec, shift, tap, min
//
// Accumulates taps[k] * constrain(p - px, threshold, shift) for the pixel
// pair \s1/\s2 into the running sum.
//   In:    v0 = px, v1 = sum, \thresh_vec = threshold (8h),
//          \shift = vector holding -shift (ushl by a negative amount is a
//          right shift), \tap = GPR holding the tap weight
//   Out:   v1 updated; v2/v3 (running min/max) updated when \min is set
//   Clobb: v16-v22
// NOTE(review): min uses an unsigned compare and max a signed compare,
// presumably so that a sentinel value marking unavailable pixels is
// ignored by both - confirm against the code that fills the tmp buffer.
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
        umin            v2.8h,  v2.8h,  \s1\().8h
        smax            v3.8h,  v3.8h,  \s1\().8h
        umin            v2.8h,  v2.8h,  \s2\().8h
        smax            v3.8h,  v3.8h,  \s2\().8h
.endif
        uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
        uqsub           v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
        uqsub           v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
        sub             v18.8h, \s1\().8h,  v0.8h   // diff = p0 - px
        sub             v22.8h, \s2\().8h,  v0.8h   // diff = p1 - px
        neg             v16.8h, v17.8h              // -clip
        neg             v20.8h, v21.8h              // -clip
        smin            v18.8h, v18.8h, v17.8h      // imin(diff, clip)
        smin            v22.8h, v22.8h, v21.8h      // imin(diff, clip)
        dup             v19.8h, \tap                // taps[k]
        smax            v18.8h, v18.8h, v16.8h      // constrain() = imax(imin(diff, clip), -clip)
        smax            v22.8h, v22.8h, v20.8h      // constrain() = imax(imin(diff, clip), -clip)
        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
.endm

// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint16_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h, size_t edges);
//
// filter_func w, bpc, pri, sec, min, suffix
//
// Emits the non-exported function cdef_filter\w\suffix\()_\bpc\()bpc_neon.
//   \w   = block width (8 or 4), \bpc = 8 or 16
//   \pri = apply the primary taps, \sec = apply the secondary taps
//   \min = clamp the result to the min/max of all pixels involved
//
// Register roles:
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir (turned into a pointer into directions\w),
//   w6 = damping, w7 = h, [sp] = edges, [sp, #8] = bitdepth_max (16 bpc)
//   x8 = pointer into pri_taps, w11 = secondary tap weight / tap loop counter
//   v24 = -pri_shift, v25 = pri threshold, v26 = -sec_shift,
//   v27 = sec threshold
// Clobbers x5, x6, x8-x12, v0-v7, v16-v22, v24-v28, v30 and the flags.
//
// For 8 bpc, edges == 0xf branches to
// cdef_filter\w\suffix\()_edged_8bpc_neon, which is defined elsewhere.
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
        ldr             w8,  [sp]                   // edges
        cmp             w8,  #0xf
        b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
.if \pri
.if \bpc == 16
        ldr             w9,  [sp, #8]               // bitdepth_max
        clz             w9,  w9
        sub             w9,  w9,  #24               // -bitdepth_min_8
        neg             w9,  w9                     // bitdepth_min_8
.endif
        movrel          x8,  pri_taps
.if \bpc == 16
        lsr             w9,  w3,  w9                // pri_strength >> bitdepth_min_8
        and             w9,  w9,  #1                // (pri_strength >> bitdepth_min_8) & 1
.else
        and             w9,  w3,  #1
.endif
        add             x8,  x8,  w9, uxtw #1
.endif
        movrel          x9,  directions\w
        add             x5,  x9,  w5,  uxtw #1
        movi            v30.4h,   #15
        dup             v28.4h,   w6                // damping

.if \pri
        dup             v25.8h, w3                  // threshold
.endif
.if \sec
        dup             v27.8h, w4                  // threshold
.endif
        // Lane 0 carries the primary threshold and lane 1 the secondary one,
        // so both shifts are derived in one pass. A lane whose source was
        // not set up above is never read back below.
        trn1            v24.4h, v25.4h, v27.4h
        clz             v24.4h, v24.4h              // clz(threshold)
        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
        neg             v24.4h, v24.4h              // -shift
.if \sec
        dup             v26.8h, v24.h[1]
.endif
.if \pri
        dup             v24.8h, v24.h[0]
.endif

1:
.if \w == 8
        ld1             {v0.8h}, [x2]               // px
.else
        add             x12, x2,  #2*8
        ld1             {v0.4h},   [x2]             // px
        ld1             {v0.d}[1], [x12]            // px
.endif

        movi            v1.8h,  #0                  // sum
.if \min
        mov             v2.16b, v0.16b              // min
        mov             v3.16b, v0.16b              // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             w11, #2                     // sec_taps[0]

2:
.if \pri
        ldrb            w9,  [x5]                   // off1

        load_px         v4,  v5,  \w
.endif

.if \sec
        add             x5,  x5,  #4                // +2*2
        ldrb            w9,  [x5]                   // off2
        load_px         v6,  v7,  \w
.endif

.if \pri
        ldrb            w10, [x8]                   // *pri_taps

        handle_pixel    v4,  v5,  v25.8h, v24.8h, w10, \min
.endif

.if \sec
        add             x5,  x5,  #8                // +2*4
        ldrb            w9,  [x5]                   // off3
        load_px         v4,  v5,  \w

        handle_pixel    v6,  v7,  v27.8h, v26.8h, w11, \min

        handle_pixel    v4,  v5,  v27.8h, v26.8h, w11, \min

        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
.else
        add             x5,  x5,  #1                // x5 += 1
.endif
        subs            w11, w11, #1                // sec_tap-- (value)
.if \pri
        add             x8,  x8,  #1                // pri_taps++ (pointer)
.endif
        b.ne            2b

        cmlt            v4.8h,  v1.8h,  #0          // -(sum < 0)
        add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
        srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
        add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
.if \min
        smin            v0.8h,  v0.8h,  v3.8h
        smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
.endif
.if \bpc == 8
        xtn             v0.8b,  v0.8h
.endif
.if \w == 8
        add             x2,  x2,  #2*16             // tmp += tmp_stride
        subs            w7,  w7,  #1                // h--
.if \bpc == 8
        st1             {v0.8b}, [x0], x1
.else
        st1             {v0.8h}, [x0], x1
.endif
.else
.if \bpc == 8
        st1             {v0.s}[0], [x0], x1
.else
        st1             {v0.d}[0], [x0], x1
.endif
        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
        subs            w7,  w7,  #2                // h -= 2
.if \bpc == 8
        st1             {v0.s}[1], [x0], x1
.else
        st1             {v0.d}[1], [x0], x1
.endif
.endif

        // Reset pri_taps and directions back to the original point
        // (sub/st1 do not touch the flags set by the subs on h above)
        sub             x5,  x5,  #2
.if \pri
        sub             x8,  x8,  #2
.endif

        b.gt            1b
        ret
endfunc
.endm

// filter w, bpc
//
// Emits the three specialised filters (_pri, _sec, _pri_sec) and the
// exported entry point cdef_filter\w\()_\bpc\()bpc_neon, which tail-branches
// to one of them based on pri_strength (w3) and sec_strength (w4). All
// argument registers and the stack are passed through untouched.
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

function cdef_filter\w\()_\bpc\()bpc_neon, export=1
        cbnz            w3,  1f // pri_strength
        b               cdef_filter\w\()_sec_\bpc\()bpc_neon     // only sec
1:
        cbnz            w4,  1f // sec_strength
        b               cdef_filter\w\()_pri_\bpc\()bpc_neon     // only pri
1:
        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm

// Per-element weights applied to the squared diagonal sums in find_dir.
const div_table
        .short         840, 420, 280, 210, 168, 140, 120, 105
endconst

// Per-element weights applied to the squared sum_alt values in cost_alt.
// 12 entries so that three 4h vectors can be loaded at once.
const alt_fact
        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst

// cost_alt d1, d2, s1, s2, s3, s4
//
// Computes two weighted sums of squares:
//   \d1 = sum(\s1:\s2 [n]^2 * fact[n]), \d2 = sum(\s3:\s4 [n]^2 * fact[n])
// where each input is 8 + 4 16 bit elements (\s1/\s3 full vector, low half
// of \s2/\s4).
//   In:    v29, v30, v31 = alt_fact widened to 3 x 4s
//   Out:   \d1, \d2 (s registers)
//   Clobb: v22-v27
.macro cost_alt d1, d2, s1, s2, s3, s4
        smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
        smull2          v23.4s,  \s1\().8h, \s1\().8h
        smull           v24.4s,  \s2\().4h, \s2\().4h
        smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
        smull2          v26.4s,  \s3\().8h, \s3\().8h
        smull           v27.4s,  \s4\().4h, \s4\().4h
        mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
        mla             v22.4s,  v23.4s,  v30.4s
        mla             v22.4s,  v24.4s,  v31.4s
        mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
        mla             v25.4s,  v26.4s,  v30.4s
        mla             v25.4s,  v27.4s,  v31.4s
        addv            \d1, v22.4s                   // *cost_ptr
        addv            \d2, v25.4s                   // *cost_ptr
.endm

// find_best s1, s2, s3
//
// One or two steps of the running arg-max over cost[].
//   In:    w0 = best_dir, w1 = best_cost, w3 = n, w4 = cost[n]
//          \s2 = vector whose lane 0 holds cost[n+1] (optional)
//          \s3 = vector whose lane 0 holds cost[n+2] (read when \s2 is given)
//   Out:   w0, w1 updated; with \s2 given, w3 += 2 and w4 = cost[n+2]
//   Clobb: w5, flags
// \s1 is not referenced by the macro body. The comparison is signed and
// strict, so the lowest index wins on ties.
.macro find_best s1, s2, s3
.ifnb \s2
        mov             w5,  \s2\().s[0]
.endif
        cmp             w4,  w1                       // cost[n] > best_cost
        csel            w0,  w3,  w0,  gt             // best_dir = n
        csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
.ifnb \s2
        add             w3,  w3,  #1                  // n++
        cmp             w5,  w1                       // cost[n] > best_cost
        mov             w4,  \s3\().s[0]
        csel            w0,  w3,  w0,  gt             // best_dir = n
        csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
        add             w3,  w3,  #1                  // n++
.endif
.endm

// Steps for loading and preparing each row
//
// The work is split in three steps so that find_dir can interleave the
// load of the next row with arithmetic on the current one.

// Step 1: load 8 pixels from [x0] and advance x0 by the stride in x1.
.macro dir_load_step1 s1, bpc
.if \bpc == 8
        ld1             {\s1\().8b}, [x0], x1
.else
        ld1             {\s1\().8h}, [x0], x1
.endif
.endm

// Step 2: 8 bpc: widen to 16 bit and subtract 128 (v31.8b).
//         16 bpc: shift by v8, which find_dir sets to -bitdepth_min_8
//         (i.e. a right shift down to 8 bit range).
.macro dir_load_step2 s1, bpc
.if \bpc == 8
        usubl           \s1\().8h,  \s1\().8b, v31.8b
.else
        ushl            \s1\().8h,  \s1\().8h, v8.8h
.endif
.endm

// Step 3: 16 bpc only: subtract 128 (v31.8h).
.macro dir_load_step3 s1, bpc
// Nothing for \bpc == 8
.if \bpc != 8
        sub             \s1\().8h,  \s1\().8h, v31.8h
.endif
.endm

// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
//                                   unsigned *const var)
//
// find_dir bpc
//
// Emits the exported function cdef_find_dir_\bpc\()bpc_neon.
//   In:    x0 = img, x1 = stride, x2 = var,
//          w3 = bitdepth_max (read for 16 bpc only)
//   Out:   w0 = best_dir,
//          *var = (best_cost - cost[best_dir ^ 4]) >> 10
//   Stack: 32 bytes for cost[8]; for 16 bpc an additional 16 bytes to
//          preserve d8 (v8 is callee-saved in its low 64 bits).
//   Clobb: x0, x1, x3-x5, v0-v7, v16-v31, flags
//
// The 8 rows are processed fully unrolled via .irpc. Accumulators:
//   v0-v1 sum_diag[0], v2-v3 sum_diag[1], v4 sum_hv[0], v5 sum_hv[1],
//   v6-v7 sum_alt[0], v16-v17 sum_alt[1], v18-v19 sum_alt[2],
//   v20-v21 sum_alt[3]; v30 = 0, v26 = current row.
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
.if \bpc == 16
        str             d8,  [sp, #-0x10]!
        clz             w3,  w3                       // clz(bitdepth_max)
        sub             w3,  w3,  #24                 // -bitdepth_min_8
        dup             v8.8h,   w3
.endif
        sub             sp,  sp,  #32                 // cost
        // NOTE(review): w3 does not appear to be read again before the
        // `mov w3, #1` below - this looks like a dead store; confirm.
        mov             w3,  #8
.if \bpc == 8
        movi            v31.16b, #128
.else
        movi            v31.8h,  #128
.endif
        movi            v30.16b, #0
        movi            v1.8h,   #0 // v0-v1 sum_diag[0]
        movi            v3.8h,   #0 // v2-v3 sum_diag[1]
        movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
        movi            v7.8h,   #0 // v6-v7 sum_alt[0]
        dir_load_step1  v26, \bpc   // Setup first row early
        movi            v17.8h,  #0 // v16-v17 sum_alt[1]
        movi            v18.8h,  #0 // v18-v19 sum_alt[2]
        dir_load_step2  v26, \bpc
        movi            v19.8h,  #0
        dir_load_step3  v26, \bpc
        movi            v21.8h,  #0 // v20-v21 sum_alt[3]

.irpc i, 01234567
        addv            h25,     v26.8h               // [y]
        rev64           v27.8h,  v26.8h
        addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
        add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
        ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
        rev64           v29.4h,  v28.4h               // [-(x >> 1)]
        ins             v4.h[\i], v25.h[0]            // sum_hv[0]
.if \i < 6
        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
.else
        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
.endif
.if \i == 0
        mov             v20.16b, v26.16b              // sum_alt[3]
.elseif \i == 1
        add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
.else
        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
.endif
.if \i == 0
        mov             v0.16b,  v26.16b              // sum_diag[0]
        dir_load_step1  v26, \bpc
        mov             v2.16b,  v27.16b              // sum_diag[1]
        dir_load_step2  v26, \bpc
        mov             v6.16b,  v28.16b              // sum_alt[0]
        dir_load_step3  v26, \bpc
        mov             v16.16b, v29.16b              // sum_alt[1]
.else
        ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
        ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
        ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
        ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
.if \i != 7 // Nothing to load for the final row
        dir_load_step1  v26, \bpc // Start setting up the next row early.
.endif
        add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
        add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
        add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
        add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
.if \i != 7
        dir_load_step2  v26, \bpc
.endif
        ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
        ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
.if \i != 7
        dir_load_step3  v26, \bpc
.endif
        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
.endif
.endr

        movi            v31.4s,  #105

        smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
        smlal2          v26.4s,  v4.8h,   v4.8h
        smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
        smlal2          v27.4s,  v5.8h,   v5.8h
        mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
        mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
        addv            s4,  v26.4s                   // cost[2]
        addv            s5,  v27.4s                   // cost[6]

        rev64           v1.8h,   v1.8h
        rev64           v3.8h,   v3.8h
        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]

        str             s4,  [sp, #2*4]               // cost[2]
        str             s5,  [sp, #6*4]               // cost[6]

        movrel          x4,  div_table
        ld1             {v31.8h}, [x4]

        smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
        smull2          v23.4s,  v0.8h,   v0.8h
        smlal           v22.4s,  v1.4h,   v1.4h
        smlal2          v23.4s,  v1.8h,   v1.8h
        smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
        smull2          v25.4s,  v2.8h,   v2.8h
        smlal           v24.4s,  v3.4h,   v3.4h
        smlal2          v25.4s,  v3.8h,   v3.8h
        uxtl            v30.4s,  v31.4h               // div_table
        uxtl2           v31.4s,  v31.8h
        mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
        mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
        mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
        mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
        addv            s0,  v22.4s                   // cost[0]
        addv            s2,  v24.4s                   // cost[4]

        movrel          x5,  alt_fact
        ld1             {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105

        str             s0,  [sp, #0*4]               // cost[0]
        str             s2,  [sp, #4*4]               // cost[4]

        uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
        uxtl            v30.4s,  v30.4h
        uxtl            v31.4s,  v31.4h

        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
        str             s6,  [sp, #1*4]               // cost[1]
        str             s16, [sp, #3*4]               // cost[3]

        mov             w0,  #0                       // best_dir
        mov             w1,  v0.s[0]                  // best_cost
        mov             w3,  #1                       // n

        str             s18, [sp, #5*4]               // cost[5]
        str             s20, [sp, #7*4]               // cost[7]

        mov             w4,  v6.s[0]

        // Walk cost[1..7] in index order; lane 0 of v6/v4/v16/v2/v18/v5/v20
        // holds cost[1]/[2]/[3]/[4]/[5]/[6]/[7] respectively.
        find_best       v6,  v4,  v16
        find_best       v16, v2,  v18
        find_best       v18, v5,  v20
        find_best       v20

        eor             w3,  w0,  #4                  // best_dir ^4
        ldr             w4,  [sp, w3, uxtw #2]
        sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
        lsr             w1,  w1,  #10
        str             w1,  [x2]                     // *var

        add             sp,  sp,  #32
.if \bpc == 16
        ldr             d8,  [sp], 0x10
.endif
        ret
endfunc
.endm