/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Syntax/target: GNU as, run through the C preprocessor; 32-bit ARM with NEON.
// Everything below is wrapped in macros (tables, filter, find_dir) and is only
// emitted when one of them is invoked.
// NOTE(review): presumably instantiated per bitdepth by the file that
// includes this template - confirm against the including sources.

// dir_table w, stride
//
// Emits the byte table directions\w. Each row holds the two signed offsets
// (first round, second round) for one direction, computed as
// y * stride + x in units of 16-bit tmp elements. The filter code indexes
// rows dir, dir + 2 and dir + 6, so the first six rows are repeated after
// the eighth one instead of wrapping the index with & 7.
.macro dir_table w, stride
const directions\w
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
        .byte            1 * \stride + 0,  2 * \stride + 0
        .byte            1 * \stride + 0,  2 * \stride - 1
// Repeated, to avoid & 7
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
endconst
.endm

// tables
//
// Emits all constant tables used by the filter functions:
//   directions8 (tmp stride 16), directions4 (tmp stride 8) and pri_taps.
// pri_taps is read as two byte pairs; the pair is selected by bit 0 of the
// (bitdepth-normalised) primary strength and walked one byte per round.
.macro tables
dir_table 8, 16
dir_table 4, 8

const pri_taps
        .byte           4, 2, 3, 3
endconst
.endm

// load_px d11, d12, d21, d22, w
//
// Loads the two tap pixels p0 = x + off and p1 = x - off.
//   In:    r2 = pointer to the current px in tmp (16-bit elements)
//          r9 = signed offset in elements (scaled to bytes with lsl #1)
//   Out:   d11:d12 = p0, d21:d22 = p1
//   Clobb: r6, r9
// For w == 8 each pair receives one row of 8 pixels. Otherwise each pair
// receives two rows of 4 pixels, the second row 2*8 bytes after the first.
.macro load_px d11, d12, d21, d22, w
.if \w == 8
        add             r6,  r2,  r9, lsl #1 // x + off
        sub             r9,  r2,  r9, lsl #1 // x - off
        vld1.16         {\d11,\d12}, [r6]    // p0
        vld1.16         {\d21,\d22}, [r9]    // p1
.else
        add             r6,  r2,  r9, lsl #1 // x + off
        sub             r9,  r2,  r9, lsl #1 // x - off
        vld1.16         {\d11}, [r6]         // p0
        add             r6,  r6,  #2*8       // += stride
        vld1.16         {\d21}, [r9]         // p1
        add             r9,  r9,  #2*8       // += stride
        vld1.16         {\d12}, [r6]         // p0
        vld1.16         {\d22}, [r9]         // p1
.endif
.endm

// handle_pixel s1, s2, thresh_vec, shift, tap, min
//
// Accumulates taps[k] * constrain(p - px) for the tap pair s1 (p0), s2 (p1).
//   In:    q0 = px, q1 = running sum
//          thresh_vec = strength replicated in every 16-bit lane
//          shift      = -shift replicated in every lane, so that vshl
//                       performs the right shift
//          tap        = core register holding the tap weight
//          q2/q3      = running min/max (only touched when min is non-zero)
//   Out:   q1 updated; q2/q3 updated when min is non-zero
//   Clobb: q8-q13
// NOTE(review): the running minimum is unsigned while the running maximum
// is signed; presumably this lets a padding fill value be ignored by both -
// confirm against the code that fills tmp.
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
        vmin.u16        q2,  q2,  \s1
        vmax.s16        q3,  q3,  \s1
        vmin.u16        q2,  q2,  \s2
        vmax.s16        q3,  q3,  \s2
.endif
        vabd.u16        q8,  q0,  \s1        // abs(diff)
        vabd.u16        q11, q0,  \s2        // abs(diff)
        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
        vsub.i16        q10, \s1, q0         // diff = p0 - px
        vsub.i16        q13, \s2, q0         // diff = p1 - px
        vneg.s16        q8,  q9              // -clip
        vneg.s16        q11, q12             // -clip
        vmin.s16        q10, q10, q9         // imin(diff, clip)
        vmin.s16        q13, q13, q12        // imin(diff, clip)
        vdup.16         q9,  \tap            // taps[k]
        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain()
        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
.endm

// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint16_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h, size_t edges);
//
// filter_func w, bpc, pri, sec, min, suffix
//
// Emits the internal worker cdef_filter<w><suffix>_<bpc>bpc_neon. It is not
// called directly: the exported dispatcher emitted by the filter macro
// branches to it after pushing r4-r9, lr and q4-q7 and loading the stack
// arguments, which is why the worker ends with the matching vpop/pop.
//
// Registers on entry (as set up by the dispatcher):
//   r0 = dst            r1 = dst_stride     r2 = tmp
//   r3 = pri_strength   r4 = sec_strength   r5 = dir
//   r6 = damping        r7 = h              r8 = edges
//   r9 = word following edges on the stack (16 bpc only); its clz is used
//        to derive bitdepth_min_8
//
// Registers inside the loops:
//   r5 = pointer into directions<w>, r8 = pointer into pri_taps (pri only)
//   lr = secondary tap value, doubling as the two-round loop counter
//   q0 = px, q1 = sum, q2/q3 = min/max (min only)
//   q4/q5 = primary -shift/threshold, q6/q7 = secondary -shift/threshold
//
// For 8 bpc, edges == 0xf diverts to cdef_filter<w><suffix>_edged_neon, which
// is expected to be provided elsewhere.
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
        cmp             r8,  #0xf
        beq             cdef_filter\w\suffix\()_edged_neon
.endif
.if \pri
.if \bpc == 16
        clz             r9,  r9
        sub             r9,  r9,  #24        // -bitdepth_min_8
        neg             r9,  r9              // bitdepth_min_8
.endif
        movrel_local    r8,  pri_taps
.if \bpc == 16
        lsr             r9,  r3,  r9         // pri_strength >> bitdepth_min_8
        and             r9,  r9,  #1         // (pri_strength >> bitdepth_min_8) & 1
.else
        and             r9,  r3,  #1
.endif
        add             r8,  r8,  r9, lsl #1 // select the pri_taps pair
.endif
        movrel_local    r9,  directions\w
        add             r5,  r9,  r5, lsl #1 // &directions[dir] (2 bytes per row)
        vmov.u16        d17, #15
        vdup.16         d16, r6              // damping

.if \pri
        vdup.16         q5,  r3              // threshold
.endif
.if \sec
        vdup.16         q7,  r4              // threshold
.endif
        // Both shifts are computed at once: lane 0 = primary, lane 1 = secondary.
        vmov.16         d8[0], r3
        vmov.16         d8[1], r4
        vclz.i16        d8,  d8              // clz(threshold)
        vsub.i16        d8,  d17, d8         // ulog2(threshold)
        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
        vneg.s16        d8,  d8              // -shift
.if \sec
        vdup.16         q6,  d8[1]
.endif
.if \pri
        vdup.16         q4,  d8[0]
.endif

1:      // Per iteration: one row of 8 pixels, or two rows of 4 pixels.
.if \w == 8
        vld1.16         {q0},  [r2, :128]    // px
.else
        add             r12, r2,  #2*8
        vld1.16         {d0},  [r2,  :64]    // px
        vld1.16         {d1},  [r12, :64]    // px
.endif

        vmov.u16        q1,  #0              // sum
.if \min
        vmov.u16        q2,  q0              // min
        vmov.u16        q3,  q0              // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             lr,  #2              // sec_taps[0]

2:
.if \pri
        ldrsb           r9,  [r5]            // off1

        load_px         d28, d29, d30, d31, \w
.endif

.if \sec
        add             r5,  r5,  #4         // +2*2
        ldrsb           r9,  [r5]            // off2
.endif

.if \pri
        ldrb            r12, [r8]            // *pri_taps

        handle_pixel    q14, q15, q5,  q4,  r12, \min
.endif

.if \sec
        load_px         d28, d29, d30, d31, \w

        add             r5,  r5,  #8         // +2*4
        ldrsb           r9,  [r5]            // off3

        handle_pixel    q14, q15, q7,  q6,  lr, \min

        load_px         d28, d29, d30, d31, \w

        handle_pixel    q14, q15, q7,  q6,  lr, \min

        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
.else
        add             r5,  r5,  #1         // r5 += 1
.endif
        subs            lr,  lr,  #1         // sec_tap-- (value)
.if \pri
        add             r8,  r8,  #1         // pri_taps++ (pointer)
.endif
        bne             2b                   // flags from the subs above; add does not set them

        vshr.s16        q14, q1,  #15        // -(sum < 0)
        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
.if \min
        vmin.s16        q0,  q0,  q3
        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
.endif
.if \bpc == 8
        vmovn.u16       d0,  q0
.endif
.if \w == 8
        add             r2,  r2,  #2*16      // tmp += tmp_stride
        subs            r7,  r7,  #1         // h--
.if \bpc == 8
        vst1.8          {d0}, [r0, :64], r1
.else
        vst1.16         {q0}, [r0, :128], r1
.endif
.else
.if \bpc == 8
        vst1.32         {d0[0]}, [r0, :32], r1
.else
        vst1.16         {d0},    [r0, :64], r1
.endif
        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
        subs            r7,  r7,  #2         // h -= 2
.if \bpc == 8
        vst1.32         {d0[1]}, [r0, :32], r1
.else
        vst1.16         {d1},    [r0, :64], r1
.endif
.endif

        // Reset pri_taps and directions back to the original point
        sub             r5,  r5,  #2
.if \pri
        sub             r8,  r8,  #2
.endif

        // The flags still come from the subs on r7 (h) above; none of the
        // stores or plain sub instructions in between update them.
        bgt             1b
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

// filter w, bpc
//
// Emits the three workers (_pri, _sec, _pri_sec) and the exported entry
// point cdef_filter<w>_<bpc>bpc_neon. The entry point saves r4-r9, lr and
// q4-q7, loads the stack arguments and branches to the worker matching the
// non-zero strengths. The worker restores the saved registers and returns
// to the original caller.
//
// The pushes occupy 7*4 + 4*16 = 92 bytes, so the first stack argument
// (sec_strength) is at [sp, #92].
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

function cdef_filter\w\()_\bpc\()bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #92]  // sec_strength, dir
        ldrd            r6,  r7,  [sp, #100] // damping, h
.if \bpc == 16
        ldrd            r8,  r9,  [sp, #108] // edges, following stack word
.else
        ldr             r8,  [sp, #108]      // edges
.endif
        cmp             r3,  #0 // pri_strength
        bne             1f
        b               cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
        cmp             r4,  #0 // sec_strength
        bne             1f
        b               cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm

// Per-position weights applied to the squared diagonal sums in find_dir
// (cost[0] and cost[4]).
const div_table, align=4
        .short         840, 420, 280, 210, 168, 140, 120, 105
endconst

// Per-position weights applied to the squared sum_alt values in find_dir
// (cost[1], cost[3], cost[5], cost[7]); the trailing 0 pads the table to
// three 4-element vectors.
const alt_fact, align=4
        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst

// cost_alt dest, s1, s2, s3, s4, s5, s6
//
// Computes two weighted sums of squares in one go:
//   dest[0] = sum(s1^2 * q13 + s2^2 * q14 + s3^2 * q15)
//   dest[1] = sum(s4^2 * q13 + s5^2 * q14 + s6^2 * q15)
//   In:    s1-s6 = d registers of signed 16-bit sums
//          q13, q14, q15 = alt_fact widened to 32 bits
//   Clobb: q1, q2, q3, q5, q6, q12
.macro cost_alt dest, s1, s2, s3, s4, s5, s6
        vmull.s16       q1,  \s1, \s1        // sum_alt[n]*sum_alt[n]
        vmull.s16       q2,  \s2, \s2
        vmull.s16       q3,  \s3, \s3
        vmull.s16       q5,  \s4, \s4        // sum_alt[n]*sum_alt[n]
        vmull.s16       q12, \s5, \s5
        vmull.s16       q6,  \s6, \s6        // q6 overlaps the first \s1-\s2 here
        vmul.i32        q1,  q1,  q13        // sum_alt[n]^2*fact
        vmla.i32        q1,  q2,  q14
        vmla.i32        q1,  q3,  q15
        vmul.i32        q5,  q5,  q13        // sum_alt[n]^2*fact
        vmla.i32        q5,  q12, q14
        vmla.i32        q5,  q6,  q15
        vadd.i32        d2,  d2,  d3
        vadd.i32        d3,  d10, d11
        vpadd.i32       \dest, d2,  d3       // *cost_ptr
.endm

// find_best s1, s2, s3
//
// One or two steps of the arg-max search over cost[].
//   In:    r12 = cost[n] (must already be loaded by the caller or by the
//                previous invocation; the s1 argument only names the lane
//                it came from and is not read by the macro body)
//          r0 = best_dir, r1 = best_cost, r3 = n
//          s2 = lane holding cost[n + 1] (optional)
//          s3 = lane holding cost[n + 2], preloaded into r12 for the next
//               invocation (required when s2 is given)
//   Out:   r0, r1 updated; r3 advanced by 2 when s2 is given
//   Clobb: lr (when s2 is given), r12, flags
// The comparison is signed (gt) and strict, so the earliest maximum wins.
.macro find_best s1, s2, s3
.ifnb \s2
        vmov.32         lr,  \s2
.endif
        cmp             r12, r1              // cost[n] > best_cost
        itt             gt
        movgt           r0,  r3              // best_dir = n
        movgt           r1,  r12             // best_cost = cost[n]
.ifnb \s2
        add             r3,  r3,  #1         // n++
        cmp             lr,  r1              // cost[n] > best_cost
        vmov.32         r12, \s3
        itt             gt
        movgt           r0,  r3              // best_dir = n
        movgt           r1,  lr              // best_cost = cost[n]
        add             r3,  r3,  #1         // n++
.endif
.endm

// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
//                                   unsigned *const var)
//
// find_dir bpc
//
// Emits the exported cdef_find_dir_<bpc>bpc_neon.
//   In:    r0 = img, r1 = stride, r2 = var
//          r3 = bitdepth_max (16 bpc only)
//   Out:   r0 = best_dir
//          *var = (best_cost - cost[best_dir ^ 4]) >> 10
//   Stack: lr and q4-q7 are saved; 32 further bytes hold cost[0..7]
// The eight rows are processed by a fully unrolled .irpc loop that
// accumulates the partial sums listed next to the register clears below.
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
        push            {lr}
        vpush           {q4-q7}
.if \bpc == 16
        clz             r3,  r3              // clz(bitdepth_max)
        sub             lr,  r3,  #24        // -bitdepth_min_8
.endif
        sub             sp,  sp,  #32        // cost
        // NOTE(review): r3 is not read again before "mov r3, #1" below, so
        // this store looks dead - kept to leave the instruction stream intact.
        mov             r3,  #8
        vmov.u16        q1,  #0              // q0-q1   sum_diag[0]
        vmov.u16        q3,  #0              // q2-q3   sum_diag[1]
        vmov.u16        q5,  #0              // q4-q5   sum_hv[0-1]
        vmov.u16        q8,  #0              // q6,d16  sum_alt[0]
                                             // q7,d17  sum_alt[1]
        vmov.u16        q9,  #0              // q9,d22  sum_alt[2]
        vmov.u16        q11, #0
        vmov.u16        q10, #0              // q10,d23 sum_alt[3]


.irpc i, 01234567
.if \bpc == 8
        vld1.8          {d30}, [r0, :64], r1
        vmov.u8         d31, #128
        vsubl.u8        q15, d30, d31        // img[x] - 128
.else
        vld1.16         {q15}, [r0, :128], r1
        vdup.16         q14, lr              // -bitdepth_min_8
        vshl.u16        q15, q15, q14
        vmov.u16        q14, #128
        vsub.i16        q15, q15, q14        // img[x] - 128
.endif
        vmov.u16        q14, #0

.if \i == 0
        vmov            q0,  q15             // sum_diag[0]
.else
        vext.8          q12, q14, q15, #(16-2*\i)
        vext.8          q13, q15, q14, #(16-2*\i)
        vadd.i16        q0,  q0,  q12        // sum_diag[0]
        vadd.i16        q1,  q1,  q13        // sum_diag[0]
.endif
        vrev64.16       q13, q15
        vswp            d26, d27             // [-x]
.if \i == 0
        vmov            q2,  q13             // sum_diag[1]
.else
        vext.8          q12, q14, q13, #(16-2*\i)
        vext.8          q13, q13, q14, #(16-2*\i)
        vadd.i16        q2,  q2,  q12        // sum_diag[1]
        vadd.i16        q3,  q3,  q13        // sum_diag[1]
.endif

        vpadd.u16       d26, d30, d31        // [(x >> 1)]
        vmov.u16        d27, #0
        vpadd.u16       d24, d26, d28
        vpadd.u16       d24, d24, d28        // [y]
        vmov.u16        r12, d24[0]
        vadd.i16        q5,  q5,  q15        // sum_hv[1]
.if \i < 4
        vmov.16         d8[\i], r12          // sum_hv[0]
.else
        vmov.16         d9[\i-4], r12        // sum_hv[0]
.endif

.if \i == 0
        vmov.u16        q6,  q13             // sum_alt[0]
.else
        vext.8          q12, q14, q13, #(16-2*\i)
        vext.8          q14, q13, q14, #(16-2*\i)
        vadd.i16        q6,  q6,  q12        // sum_alt[0]
        vadd.i16        d16, d16, d28        // sum_alt[0]
.endif
        vrev64.16       d26, d26             // [-(x >> 1)]
        vmov.u16        q14, #0
.if \i == 0
        vmov            q7,  q13             // sum_alt[1]
.else
        vext.8          q12, q14, q13, #(16-2*\i)
        vext.8          q13, q13, q14, #(16-2*\i)
        vadd.i16        q7,  q7,  q12        // sum_alt[1]
        vadd.i16        d17, d17, d26        // sum_alt[1]
.endif

.if \i < 6
        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
        vadd.i16        q9,  q9,  q12        // sum_alt[2]
        vadd.i16        d22, d22, d26        // sum_alt[2]
.else
        vadd.i16        q9,  q9,  q15        // sum_alt[2]
.endif
.if \i == 0
        vmov            q10, q15             // sum_alt[3]
.elseif \i == 1
        vadd.i16        q10, q10, q15        // sum_alt[3]
.else
        vext.8          q12, q14, q15, #(16-2*(\i/2))
        vext.8          q13, q15, q14, #(16-2*(\i/2))
        vadd.i16        q10, q10, q12        // sum_alt[3]
        vadd.i16        d23, d23, d26        // sum_alt[3]
.endif
.endr

        vmov.u32        q15, #105

        vmull.s16       q12, d8,  d8         // sum_hv[0]*sum_hv[0]
        vmlal.s16       q12, d9,  d9
        vmull.s16       q13, d10, d10        // sum_hv[1]*sum_hv[1]
        vmlal.s16       q13, d11, d11
        vadd.s32        d8,  d24, d25
        vadd.s32        d9,  d26, d27
        vpadd.s32       d8,  d8,  d9         // cost[2,6] (s16, s17)
        vmul.i32        d8,  d8,  d30        // cost[2,6] *= 105

        vrev64.16       q1,  q1
        vrev64.16       q3,  q3
        vext.8          q1,  q1,  q1,  #10   // sum_diag[0][14-n]
        vext.8          q3,  q3,  q3,  #10   // sum_diag[1][14-n]

        vstr            s16, [sp, #2*4]      // cost[2]
        vstr            s17, [sp, #6*4]      // cost[6]

        movrel_local    r12, div_table
        vld1.16         {q14}, [r12, :128]

        vmull.s16       q5,  d0,  d0         // sum_diag[0]*sum_diag[0]
        vmull.s16       q12, d1,  d1
        vmlal.s16       q5,  d2,  d2
        vmlal.s16       q12, d3,  d3
        vmull.s16       q0,  d4,  d4         // sum_diag[1]*sum_diag[1]
        vmull.s16       q1,  d5,  d5
        vmlal.s16       q0,  d6,  d6
        vmlal.s16       q1,  d7,  d7
        vmovl.u16       q13, d28             // div_table
        vmovl.u16       q14, d29
        vmul.i32        q5,  q5,  q13        // cost[0]
        vmla.i32        q5,  q12, q14
        vmul.i32        q0,  q0,  q13        // cost[4]
        vmla.i32        q0,  q1,  q14
        vadd.i32        d10, d10, d11
        vadd.i32        d0,  d0,  d1
        vpadd.i32       d0,  d10, d0         // cost[0,4] = s0,s1

        movrel_local    r12, alt_fact
        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105

        vstr            s0,  [sp, #0*4]      // cost[0]
        vstr            s1,  [sp, #4*4]      // cost[4]

        vmovl.u16       q13, d29             // div_table[2*m+1] + 105
        vmovl.u16       q14, d30
        vmovl.u16       q15, d31

        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
        vstr            s28, [sp, #1*4]      // cost[1]
        vstr            s29, [sp, #3*4]      // cost[3]

        mov             r0,  #0              // best_dir
        vmov.32         r1,  d0[0]           // best_cost
        mov             r3,  #1              // n

        vstr            s30, [sp, #5*4]      // cost[5]
        vstr            s31, [sp, #7*4]      // cost[7]

        vmov.32         r12, d14[0]          // cost[1], first candidate for find_best

        // Candidates are visited in order cost[1] .. cost[7]; cost[0] is the
        // initial best. Each invocation preloads r12 for the next one.
        find_best       d14[0], d8[0], d14[1]
        find_best       d14[1], d0[1], d15[0]
        find_best       d15[0], d8[1], d15[1]
        find_best       d15[1]

        eor             r3,  r0,  #4         // best_dir ^4
        ldr             r12, [sp, r3, lsl #2]
        sub             r1,  r1,  r12        // best_cost - cost[best_dir ^ 4]
        lsr             r1,  r1,  #10
        str             r1,  [r2]            // *var

        add             sp,  sp,  #32
        vpop            {q4-q7}
        pop             {pc}
endfunc
.endm