; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; DUP4 a, b, ...: emit every argument as four consecutive identical bytes.
%macro DUP4 1-*
    %rep %0
        times 4 db %1
        %rotate 1
    %endrep
%endmacro

; DIRS: emit the 16 direction offsets as (+offset, -offset) byte pairs, starting
; at the 13th argument and wrapping around, so that the table begins two
; directions "before" direction 0 and ends two directions "after" direction 7.
; Two consecutive pairs make up one direction, i.e. each direction occupies
; 4 bytes, which is why the code indexes the table with (dir+N)*4.
%macro DIRS 16 ; cdef_directions[]
    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
        ; masking away unused bits allows us to use a single vpaddd {1to16}
        ; instruction instead of having to do vpbroadcastd + paddb
        db %13 & 0x3f, -%13 & 0x3f
        %rotate 1
    %endrep
%endmacro

SECTION_RODATA 64

; vpermi2b index tables: values 0-63 select from the first source register,
; 64-127 from the second one.
lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
; lut_perm_4x8a and lut_perm_4x8b are each read as a full 64-byte vector and
; therefore overlap; 4x8b starts 32 bytes into 4x8a, hence the unaligned load
; (movu) used for it.
lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
lut_perm_4x8b: db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
               db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
               db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
lut_perm_8x8b: db 12, 13,  0,  1,  2,  3,  4,  5, 14, 15, 16, 17, 18, 19, 20, 21
               db  2,  3,  4,  5,  6,  7,  8,  9, 18, 19, 20, 21, 22, 23, 24, 25
               db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
               db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
end_perm_clip: db  0,  4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
; One 64-bit mask per edge-flag combination (table index = edge flags). Bit N
; of a mask is tested via vpshufbitqmb to decide whether lut index N may be
; read.
edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
sec_tap:       db 32, 32, 16, 16
pd_268435568:  dd 268435568

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

; lut:
; t0 t1 t2 t3 t4 t5 t6 t7
; T0 T1 T2 T3 T4 T5 T6 T7
; L0 L1 00 01 02 03 04 05
; L2 L3 10 11 12 13 14 15
; L4 L5 20 21 22 23 24 25
; L6 L7 30 31 32 33 34 35
; b0 b1 b2 b3 b4 b5 b6 b7
; B0 B1 B2 B3 B4 B5 B6 B7

;-----------------------------------------------------------------------------
; cdef_filter_4x4_8bpc(dst, stride, left, top, bot,
;                      pri, sec, dir, damping, edge)
;
; Filters one 4x4 block of 8-bit pixels in place; the result is written back
; as one dword per row to dst rows 0-3.
;   dst, stride  block to filter and its row pitch
;   left         left-neighbour pixels (L0..L7 in the lut diagram above)
;   top, bot     rows above/below the block; bot is only dereferenced when
;                (edge & 8) is set
;   pri, sec     primary/secondary strengths; a zero value skips that pass
;   dir          direction, used as an index into cdef_dirs
;   damping      together with lzcnt(strength) selects the gf_shr shift matrix
;   edge         edge flags; 0x0f takes the fast path without index masking
;
; .sec and .mask_edges_sec are local subroutines (entered with call, left with
; a plain ret) that accumulate the secondary taps into m0.
;-----------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r7-edge_mask
    movq         xmm0, [dstq+strideq*0]
    movhps       xmm0, [dstq+strideq*1]
    lea            r7, [edge_mask]
    movq         xmm1, [topq+strideq*0-2]
    movhps       xmm1, [topq+strideq*1-2]
    mov           r6d, edgem
    vinserti32x4  ym0, ymm0, [leftq], 1
    lea            r2, [strideq*3]
    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
    mova           m5, [base+lut_perm_4x4]
    vinserti32x4   m0, [dstq+r2], 2
    test          r6b, 0x08      ; avoid buffer overread
    jz .main
    vinserti32x4   m1, [botq+strideq*0-4], 2
    vinserti32x4   m0, [botq+strideq*1-4], 3
.main:
    movifnidn    prid, prim
    mov           t0d, dirm
    mova           m3, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b       m5, m0, m1    ; lut
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor           m7, m7
    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m6, m3, m5    ; px
    cmp           r6d, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
; Primary pass, shared by the fast path and the edge-masked path.
; In:  m1 = primary neighbours, m6 = px, m7 = 0, r3 = shift matrix base,
;      prid = primary strength
; Out: m0 += signed primary taps; secd is loaded; r6d/prid are clobbered.
%macro CDEF_FILTER_4x4_PRI 0
    vpcmpub        k1, m6, m1, 6 ; px > pN
    psubb          m2, m1, m6
    lzcnt         r6d, prid
    vpsubb     m2{k1}, m6, m1    ; abs(diff)
    vpbroadcastb   m4, prid
    and          prid, 1
    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
    movifnidn    secd, secm
    vpbroadcastd  m10, [base+pri_tap+priq*4]
    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift)))
    pminub         m2, m4
    vpdpbusd       m0, m2, m10   ; sum
%endmacro
    CDEF_FILTER_4x4_PRI
    test         secd, secd
    jz .end_no_clip
    call .sec
.end_clip:
    ; both passes ran: clamp the result to the min/max of px and all taps
    pminub         m4, m6, m1
    pmaxub         m1, m6
    pminub         m5, m2, m3
    pmaxub         m2, m3
    pminub         m4, m5
    pmaxub         m2, m1
    psrldq         m1, m4, 2
    psrldq         m3, m2, 2
    pminub         m1, m4
    vpcmpw         k1, m0, m7, 1
    vpshldd        m6, m0, 8
    pmaxub         m2, m3
    pslldq         m3, m1, 1
    psubw          m7, m0
    paddusw        m0, m6     ; clip >0xff
    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
    pslldq         m4, m2, 1
    pminub         m1, m3
    pmaxub         m2, m4
    pmaxub         m0, m1
    pminub         m0, m2
    jmp .end
.sec_only:
    movifnidn    secd, secm
    call .sec
.end_no_clip:
    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
.end:
    mova          xm1, [base+end_perm]
    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    RET
.mask_edges_sec_only:
    movifnidn    secd, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    ; m8 = edge mask for this flag combination; lanes whose lut index is
    ; rejected by the mask keep px instead of reading the lut
    vpbroadcastq   m8, [base+edge_mask+r6*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m8, m2 ; index in-range
    mova           m1, m6
    vpermb     m1{k1}, m2, m5
    CDEF_FILTER_4x4_PRI
    test         secd, secd
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m8, m4
    mova           m2, m6
    vpermb     m2{k1}, m4, m5
    vpshufbitqmb   k1, m8, m9
    mova           m3, m6
    vpermb     m3{k1}, m9, m5
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m3, [base+cdef_dirs+(t0+0)*4] {1to16}     ; dir - 2
    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
.sec_main:
    vpbroadcastd   m8, [base+sec_tap]
    vpcmpub        k1, m6, m2, 6
    psubb          m4, m2, m6
    vpbroadcastb  m12, secd
    lzcnt        secd, secd
    vpsubb     m4{k1}, m6, m2
    vpcmpub        k2, m6, m3, 6
    vpbroadcastq  m11, [r3+secq*8]
    gf2p8affineqb m10, m4, m11, 0
    psubb          m5, m3, m6
    mova           m9, m8
    vpsubb     m8{k1}, m7, m8
    psubusb       m10, m12, m10
    vpsubb     m5{k2}, m6, m3
    pminub         m4, m10
    vpdpbusd       m0, m4, m8
    gf2p8affineqb m11, m5, m11, 0
    vpsubb     m9{k2}, m7, m9
    psubusb       m12, m11
    pminub         m5, m12
    vpdpbusd       m0, m5, m9
    ret

DECLARE_REG_TMP 2, 7

;         lut top                lut bottom
; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
; L8 L9 40 41 42 43 44 45  b0 b1 b2 b3 b4 b5 b6 b7
; La Lb 50 51 52 53 54 55  B0 B1 B2 B3 B4 B5 B6 B7

;-----------------------------------------------------------------------------
; cdef_filter_4x8_8bpc(dst, stride, left, top, bot,
;                      pri, sec, dir, damping, edge)
;
; Same arguments as cdef_filter_4x4_8bpc, for a 4x8 block. The block is
; handled as a top and a bottom 4x4 half, each with its own lut (m14/m15),
; pixel vector (m2/m3) and sum (m0/m1). The eight rows are read with a gather
; and written back with a scatter, using ym21 = stride * {0..7} as row offsets.
;
; .sec and .mask_edges_sec are local subroutines (entered with call, left with
; a plain ret) that accumulate the secondary taps into m0/m1.
;-----------------------------------------------------------------------------
cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r8-edge_mask
    vpbroadcastd ym21, strided
    mov           r6d, edgem
    lea            r8, [edge_mask]
    movq          xm1, [topq+strideq*0-2]
    pmulld       ym21, [base+pd_01234567]
    kxnorb         k1, k1, k1
    movq          xm2, [topq+strideq*1-2]
    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
    mova          m14, [base+lut_perm_4x8a]
    movu          m15, [base+lut_perm_4x8b]
    test          r6b, 0x08         ; avoid buffer overread
    jz .main
    vinserti32x4  ym1, [botq+strideq*0-2], 1
    vinserti32x4  ym2, [botq+strideq*1-2], 1
.main:
    punpcklqdq    ym1, ym2
    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
    movifnidn    prid, prim
    mov           t0d, dirm
    mova          m16, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m14, m0, m1    ; lut top
    vpermi2b      m15, m0, m1    ; lut bottom
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m20, m20
    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m2, m16, m14  ; pxt
    vpermb         m3, m16, m15  ; pxb
    mova           m1, m0
    cmp           r6b, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
    vpermb         m5, m6, m15   ; pNb
; Primary pass for both halves, shared by the fast and edge-masked paths.
; In:  m4/m5 = primary neighbours (top/bottom), m2/m3 = px (top/bottom),
;      m20 = 0, r3 = shift matrix base, prid = primary strength
; Out: m0/m1 += signed primary taps; t1d = sec; r6d/prid are clobbered.
%macro CDEF_FILTER_4x8_PRI 0
    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
    psubb          m6, m4, m2
    psubb          m7, m5, m3
    lzcnt         r6d, prid
    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
    vpbroadcastb  m13, prid
    vpbroadcastq   m9, [r3+r6*8]
    and          prid, 1
    vpbroadcastd  m11, [base+pri_tap+priq*4]
    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
    mova          m10, m11
    movifnidn     t1d, secm
    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift)))
    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift)))
    pminub         m6, m12
    pminub         m7, m13
    vpdpbusd       m0, m6, m10   ; sum top
    vpdpbusd       m1, m7, m11   ; sum bottom
%endmacro
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d       ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    ; both passes ran: clamp the result to the min/max of px and all taps
    pminub        m10, m4, m2
    pminub        m12, m6, m8
    pminub        m11, m5, m3
    pminub        m13, m7, m9
    pmaxub         m4, m2
    pmaxub         m6, m8
    pmaxub         m5, m3
    pmaxub         m7, m9
    pminub        m10, m12
    pminub        m11, m13
    pmaxub         m4, m6
    pmaxub         m5, m7
    mov           r2d, 0xAAAAAAAA
    kmovd          k1, r2d
    kxnorb         k2, k2, k2       ;   hw   lw
    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
    vpshrdd        m2, m3, 16
    pminub         m6, m7
    pmaxub         m8, m9
    mova         ym14, [base+end_perm]
    vpcmpw         k1, m4, m20, 1
    vpshldw        m2, m5, 8
    pslldq         m7, m6, 1
    pslldq         m9, m8, 1
    psubw          m5, m20, m4
    paddusw        m0, m4, m2 ; clip >0xff
    pminub         m6, m7
    pmaxub         m8, m9
    psubusw    m0{k1}, m2, m5 ; clip <0x00
    pmaxub         m0, m6
    pminub         m0, m8
    vpermb         m0, m14, m0
    vpscatterdd [dstq+ym21]{k2}, ym0
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          ym4, [base+end_perm]
    kxnorb         k1, k1, k1
    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m3, m1, 8
    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw          m1, m3
    pslld          m0, 16
    vpshrdd        m0, m1, 16
    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
    vpscatterdd [dstq+ym21]{k1}, ym0
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    ; the two 4x4 halves border each other, so each gets its own edge mask
    mov           t1d, r6d
    or            r6d, 8 ; top 4x4 has bottom
    or            t1d, 4 ; bottom 4x4 has top
    vpbroadcastq  m17, [base+edge_mask+r6*8]
    vpbroadcastq  m18, [base+edge_mask+t1*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m17, m6 ; index in-range
    vpshufbitqmb   k2, m18, m6
    mova           m4, m2
    mova           m5, m3
    vpermb     m4{k1}, m6, m14
    vpermb     m5{k2}, m6, m15
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m17, m10
    vpshufbitqmb   k2, m18, m10
    vpshufbitqmb   k3, m17, m11
    vpshufbitqmb   k4, m18, m11
    mova           m6, m2
    mova           m7, m3
    mova           m8, m2
    mova           m9, m3
    vpermb     m6{k1}, m10, m14
    vpermb     m7{k2}, m10, m15
    vpermb     m8{k3}, m11, m14
    vpermb     m9{k4}, m11, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
    vpermb         m7, m8, m15 ; pNb
    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
    vpermb         m9, m9, m15 ; pNb
.sec_main:
    vpbroadcastb  m18, t1d
    lzcnt         t1d, t1d
    vpcmpub        k1, m2, m6, 6
    vpcmpub        k2, m3, m7, 6
    vpcmpub        k3, m2, m8, 6
    vpcmpub        k4, m3, m9, 6
    vpbroadcastq  m17, [r3+t1*8]
    psubb         m10, m6, m2
    psubb         m11, m7, m3
    psubb         m12, m8, m2
    psubb         m13, m9, m3
    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
    vpsubb    m11{k2}, m3, m7      ; abs(db0)
    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
    vpsubb    m13{k4}, m3, m9      ; abs(db1)
    vpbroadcastd  m19, [base+sec_tap]
    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift)))
    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift)))
    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift)))
    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift)))
    pminub        m10, m14
    pminub        m11, m15
    pminub        m12, m16
    pminub        m13, m17
    mova          m14, m19
    mova          m15, m19
    mova          m16, m19
    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
    vpdpbusd       m0, m10, m14
    vpdpbusd       m1, m11, m15
    vpdpbusd       m0, m12, m16
    vpdpbusd       m1, m13, m19
    ret

;         lut tl                   lut tr
; t0 t1 t2 t3 t4 t5 t6 t7  t4 t5 t6 t7 t8 t9 ta tb
; T0 T1 T2 T3 T4 T5 T6 T7  T4 T5 T6 T7 T8 T9 Ta Tb
; L0 L1 00 01 02 03 04 05  02 03 04 05 06 07 08 09
; L2 L3 10 11 12 13 14 15  12 13 14 15 16 17 18 19
; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
;         lut bl                   lut br
; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
; Lc Ld 60 61 62 63 64 65  62 63 64 65 66 67 68 69
; Le Lf 70 71 72 73 74 75  72 73 74 75 76 77 78 79
; b0 b1 b2 b3 b4 b5 b6 b7  b4 b5 b6 b7 b8 b9 ba bb
; B0 B1 B2 B3 B4 B5 B6 B7  B4 B5 B6 B7 B8 B9 Ba Bb

;-----------------------------------------------------------------------------
; cdef_filter_8x8_8bpc(dst, stride, left, top, bot, pri, sec, dir, damping, edge)
;
; CDEF filter for one 8x8 block of 8-bit pixels, rewritten in place at dst.
; Declared through x86inc's cglobal: 5 arguments loaded into registers,
; 11 GPRs, 32 vector registers and 4*64 bytes of stack. pri, sec, dir, damping
; and edge are read via the prim/secm/dirm/dampingm/edgem accessors.
;
; The block is handled as four 4x4 quadrants (tl, tr, bl, br), one zmm each:
;   m12-m15  pixel lookup tables ("lut", see the layout table above)
;   m4 -m7   centre pixels (px)
;   m8 -m11  primary-direction neighbours (pN)
;   m0 -m3   running sums, seeded from pd_268435568
;   m30      px_idx byte indices (later reused for sec_tap), m31 = 0
;
; Control flow:
;   edge != 0x0f  -> .mask_edges (neighbours outside the frame are replaced
;                    by the centre pixel)
;   pri  == 0     -> secondary taps only, finishes in .end_no_clip
;   sec  == 0     -> primary taps only,   finishes in .end_no_clip
;   otherwise     -> both tap sets,       finishes in .end_clip
; .sec and .mask_edges_sec are local subroutines: entered with `call`, they
; fall into / jump to .sec_main and come back through its plain `ret`.
; The function itself leaves through x86inc's RET.
;
; t0/t1 are register aliases defined outside this block.
;-----------------------------------------------------------------------------
cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
; r8 holds the address of edge_mask, so base+sym addresses sym relative to it
%define base r8-edge_mask
    ; Gather the 8 rows of dst, the left-edge pixels and the 2+2 rows above
    ; and below into m16/m17/m18 (one 128-bit lane per row pair member).
    movu            xm16, [dstq+strideq*0]
    pinsrd          xm16, [leftq+4*0], 3
    mov             r6d, edgem
    vinserti128     ym16, [dstq+strideq*1], 1
    lea             r10, [dstq+strideq*4]       ; r10 = &dst row 4
    movu            xm17, [dstq+strideq*2]
    vinserti32x4    m16, [topq+strideq*0-2], 2
    lea             r9, [strideq*3]             ; r9 = 3 * stride
    pinsrd          xm17, [leftq+4*1], 3
    vinserti32x4    m16, [topq+strideq*1-2], 3  ; 0 1 t T
    lea             r8, [edge_mask]
    vinserti128     ym17, [dstq+r9       ], 1
    vpbroadcastd    ym18, [leftq+4*2]
    vpblendd        ym17, ym18, 0x80            ; left pixels into top dword of lane 1
    movu            xm18, [r10 +strideq*2]
    vinserti32x4    m17, [r10 +strideq*0], 2
    pinsrd          xm18, [leftq+4*3], 3
    vinserti32x4    m17, [r10 +strideq*1], 3    ; 2 3 4 5
    vinserti128     ym18, [r10 +r9       ], 1
    test            r6b, 0x08                   ; avoid buffer overread
    jz .main                                    ; edge bit 3 clear: skip the bot rows
    vinserti32x4    m18, [botq+strideq*0-2], 2
    vinserti32x4    m18, [botq+strideq*1-2], 3  ; 6 7 b B
.main:
    mova            m0, [base+lut_perm_8x8a]
    movu            m1, [base+lut_perm_8x8b]
    mova            m30, [base+px_idx]
    vpermb          m16, m0, m16
    movifnidn       prid, prim
    vpermb          m17, m1, m17
    mov             t0d, dirm
    vpermb          m18, m0, m18
    mov             r3d, dampingm               ; r3 is reused; top was read above
    vshufi32x4      m12, m16, m17, q2020        ; lut tl
    vshufi32x4      m13, m16, m17, q3131        ; lut tr
    vshufi32x4      m14, m17, m18, q0220        ; lut bl
    vshufi32x4      m15, m17, m18, q1331        ; lut br
    vpbroadcastd    m0, [base+pd_268435568]     ; (1 << 28) + (7 << 4)
    pxor            m31, m31                    ; m31 = 0, used for negation/compares
    lea             r3, [r8+r3*8]               ; gf_shr + (damping - 30) * 8
    vpermb          m4, m30, m12                ; pxtl
    mova            m1, m0                      ; every sum starts from the same seed
    vpermb          m5, m30, m13                ; pxtr
    mova            m2, m0
    vpermb          m6, m30, m14                ; pxbl
    mova            m3, m0
    vpermb          m7, m30, m15                ; pxbr
    cmp             r6b, 0x0f
    jne .mask_edges                             ; mask edges only if required
    test            prid, prid
    jz .sec_only
    vpaddd          m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb          m8, m11, m12                ; pNtl k0p0 k0p1 k1p0 k1p1
    vpermb          m9, m11, m13                ; pNtr
    vpermb          m10, m11, m14               ; pNbl
    vpermb          m11, m11, m15               ; pNbr

; Accumulate the primary taps into m0-m3.
; In:       m4-m7 = px, m8-m11 = primary neighbours, m31 = 0, r3 = shift table
; Out:      m0-m3 += constrained diff * signed tap; t1d = sec
; Clobbers: m16-m29, k1-k4, r6d (edge flags are lost), prid (reduced to pri & 1)
%macro CDEF_FILTER_8x8_PRI 0
    vpcmpub         k1, m4, m8, 6               ; pxtl > pNtl
    vpcmpub         k2, m5, m9, 6               ; pxtr > pNtr
    vpcmpub         k3, m6, m10, 6              ; pxbl > pNbl
    vpcmpub         k4, m7, m11, 6              ; pxbr > pNbr
    psubb           m16, m8, m4
    psubb           m17, m9, m5
    psubb           m18, m10, m6
    psubb           m19, m11, m7
    lzcnt           r6d, prid                   ; row index into the shift table at r3
    vpsubb          m16{k1}, m4, m8             ; abs(diff_tl)
    vpsubb          m17{k2}, m5, m9             ; abs(diff_tr)
    vpsubb          m18{k3}, m6, m10            ; abs(diff_bl)
    vpsubb          m19{k4}, m7, m11            ; abs(diff_br)
    vpbroadcastq    m28, [r3+r6*8]              ; affine matrix performing the shift
    vpbroadcastb    m29, prid                   ; pri_strength in every byte
    and             prid, 1                     ; low bit of pri picks the tap set
    vpbroadcastd    m27, [base+pri_tap+priq*4]
    vgf2p8affineqb  m20, m16, m28, 0            ; abs(dtl) >> shift
    vgf2p8affineqb  m21, m17, m28, 0            ; abs(dtr) >> shift
    vgf2p8affineqb  m22, m18, m28, 0            ; abs(dbl) >> shift
    vgf2p8affineqb  m23, m19, m28, 0            ; abs(dbr) >> shift
    mova            m24, m27
    mova            m25, m27
    mova            m26, m27
    movifnidn       t1d, secm
    vpsubb          m24{k1}, m31, m27           ; apply_sign(pri_tap_tl)
    vpsubb          m25{k2}, m31, m27           ; apply_sign(pri_tap_tr)
    vpsubb          m26{k3}, m31, m27           ; apply_sign(pri_tap_bl)
    vpsubb          m27{k4}, m31, m27           ; apply_sign(pri_tap_br)
    psubusb         m20, m29, m20               ; imax(0, pri_strength - (abs(dtl) >> shift))
    psubusb         m21, m29, m21               ; imax(0, pri_strength - (abs(dtr) >> shift))
    psubusb         m22, m29, m22               ; imax(0, pri_strength - (abs(dbl) >> shift))
    psubusb         m23, m29, m23               ; imax(0, pri_strength - (abs(dbr) >> shift))
    pminub          m16, m20                    ; constrain: min(abs(diff), limit)
    pminub          m17, m21
    pminub          m18, m22
    pminub          m19, m23
    vpdpbusd        m0, m16, m24                ; sum tl
    vpdpbusd        m1, m17, m25                ; sum tr
    vpdpbusd        m2, m18, m26                ; sum bl
    vpdpbusd        m3, m19, m27                ; sum br
%endmacro
    CDEF_FILTER_8x8_PRI
    test            t1d, t1d                    ; sec
    jz .end_no_clip
    call .sec
; Both tap sets were applied. Here m8-m11 hold the primary neighbours and
; m16-m19 / m12-m15 the two secondary neighbour sets; the result is clamped
; to the min/max over px and all of those neighbours.
.end_clip:
    pminub          m20, m8, m4
    pminub          m24, m12, m16
    pminub          m21, m9, m5
    pminub          m25, m13, m17
    pminub          m22, m10, m6
    pminub          m26, m14, m18
    pminub          m23, m11, m7
    pminub          m27, m15, m19
    pmaxub          m8, m4
    pmaxub          m12, m16
    pmaxub          m9, m5
    pmaxub          m13, m17
    pmaxub          m10, m6
    pmaxub          m14, m18
    pmaxub          m11, m7
    pmaxub          m15, m19
    pminub          m20, m24                    ; m20-m23 = bytewise min per quadrant
    pminub          m21, m25
    pminub          m22, m26
    pminub          m23, m27
    pmaxub          m8, m12                     ; m8-m11 = bytewise max per quadrant
    pmaxub          m9, m13
    pmaxub          m10, m14
    pmaxub          m11, m15
    mov             r2d, 0xAAAAAAAA             ; r2 is free to use as scratch here
    kmovd           k1, r2d                     ; k1 = every odd 16-bit word
    ; Word-granular merge of the left/right quadrant registers (sums, mins,
    ; maxes and px), followed by min/max across the merged halves.
    vpshrdd         m24, m0, m1, 16
    vpshrdd         m25, m2, m3, 16
    vpshrdd         m12, m20, m21, 16
    vpshrdd         m14, m22, m23, 16
    vpshrdd         m16, m8, m9, 16
    vpshrdd         m18, m10, m11, 16
    vpblendmw       m13{k1}, m20, m21
    vpblendmw       m15{k1}, m22, m23
    vpblendmw       m17{k1}, m8, m9
    vpblendmw       m19{k1}, m10, m11
    vpblendmw       m20{k1}, m0, m24
    vpblendmw       m21{k1}, m24, m1
    vpblendmw       m22{k1}, m2, m25
    vpblendmw       m23{k1}, m25, m3
    vpshrdd         m4, m5, 16
    vpshrdd         m6, m7, 16
    pminub          m12, m13
    pminub          m14, m15
    pmaxub          m16, m17
    pmaxub          m18, m19
    mova            m8, [base+end_perm_clip]
    vpcmpw          k2, m20, m31, 1             ; k2/k3 = words that are < 0 (signed)
    vpcmpw          k3, m22, m31, 1
    vpshldw         m4, m21, 8
    vpshldw         m6, m23, 8
    kunpckdq        k1, k1, k1                  ; widen the 0xAAAAAAAA pattern to 64 bits
    ; NOTE(review): k4 is not read again in this block - looks vestigial, confirm
    kxnorb          k4, k4, k4
    vpshrdw         m11, m12, m14, 8
    vpshrdw         m15, m16, m18, 8
    vpblendmb       m13{k1}, m12, m14
    vpblendmb       m17{k1}, m16, m18
    psubw           m21, m31, m20               ; negated copies for the < 0 lanes
    psubw           m23, m31, m22
    paddusw         m0, m20, m4                 ; clip >0xff
    paddusw         m1, m22, m6
    pminub          m11, m13                    ; m11 = final per-pixel min
    pmaxub          m15, m17                    ; m15 = final per-pixel max
    psubusw         m0{k2}, m4, m21             ; clip <0x00
    psubusw         m1{k3}, m6, m23
    psrlw           m0, 8
    vmovdqu8        m0{k1}, m1                  ; interleave the two halves bytewise
    pmaxub          m0, m11                     ; clamp to [min, max]
    pminub          m0, m15
    vpermb          m0, m8, m0                  ; reorder bytes into row order
    vextracti32x4   xm1, m0, 1
    vextracti32x4   xm2, m0, 2
    vextracti32x4   xm3, m0, 3
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*2], xm1
    movq            [r10 +strideq*0], xm2
    movq            [r10 +strideq*2], xm3
    movhps          [dstq+strideq*1], xm0
    movhps          [dstq+r9       ], xm1
    movhps          [r10 +strideq*1], xm2
    movhps          [r10 +r9       ], xm3
    RET
.sec_only:
    movifnidn       t1d, secm
    call .sec
; Only one tap set was applied: store without the min/max clamp.
.end_no_clip:
    mova            xm8, [base+end_perm]
    ; NOTE(review): k1 is not read again on this path - looks vestigial, confirm
    kxnorb          k1, k1, k1
    vpshldd         m4, m0, 8                   ; (px << 8) + ((sum > -8) << 4)
    vpshldd         m5, m1, 8
    vpshldd         m6, m2, 8
    vpshldd         m7, m3, 8
    paddw           m0, m4                      ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    vpermb          m0, m8, m0                  ; gather the result bytes (indices in xm8)
    vpermb          m1, m8, m1
    vpermb          m2, m8, m2
    vpermb          m3, m8, m3
    punpckldq       m4, m0, m1
    punpckhdq       m0, m1
    punpckldq       m5, m2, m3
    punpckhdq       m2, m3
    movq            [dstq+strideq*0], xm4
    movq            [dstq+strideq*2], xm0
    movq            [r10 +strideq*0], xm5
    movq            [r10 +strideq*2], xm2
    movhps          [dstq+strideq*1], xm4
    movhps          [dstq+r9       ], xm0
    movhps          [r10 +strideq*1], xm5
    movhps          [r10 +r9       ], xm2
    RET
.mask_edges_sec_only:
    movifnidn       t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
; Edge-aware path: r6d = edge flags. Each quadrant gets its own edge_mask row,
; with the sides that border another quadrant forced to "available".
.mask_edges:
    mov             t0d, r6d
    mov             t1d, r6d
    or              t0d, 0xA                    ; top-left 4x4 has bottom and right
    or              t1d, 0x9                    ; top-right 4x4 has bottom and left
    vpbroadcastq    m26, [base+edge_mask+t0*8]
    vpbroadcastq    m27, [base+edge_mask+t1*8]
    mov             t1d, r6d
    or              r6d, 0x6                    ; bottom-left 4x4 has top and right
    or              t1d, 0x5                    ; bottom-right 4x4 has top and left
    vpbroadcastq    m28, [base+edge_mask+r6*8]
    vpbroadcastq    m29, [base+edge_mask+t1*8]
    mov             t0d, dirm                   ; t0 was used as scratch: reload dir
    test            prid, prid
    jz .mask_edges_sec_only
    vpaddd          m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb    k1, m26, m20                ; index in-range
    vpshufbitqmb    k2, m27, m20
    vpshufbitqmb    k3, m28, m20
    vpshufbitqmb    k4, m29, m20
    mova            m8, m4                      ; default every neighbour to px ...
    mova            m9, m5
    mova            m10, m6
    mova            m11, m7
    vpermb          m8{k1}, m20, m12            ; ... and fetch only the in-range ones
    vpermb          m9{k2}, m20, m13
    vpermb          m10{k3}, m20, m14
    vpermb          m11{k4}, m20, m15
    mova            [rsp+0x00], m26             ; the PRI macro overwrites m26-m29,
    mova            [rsp+0x40], m27             ; so spill the edge masks first
    mova            [rsp+0x80], m28
    mova            [rsp+0xC0], m29
    CDEF_FILTER_8x8_PRI
    test            t1d, t1d
    jz .end_no_clip
    mova            m26, [rsp+0x00]             ; reload the edge masks for the sec taps
    mova            m27, [rsp+0x40]
    mova            m28, [rsp+0x80]
    mova            m29, [rsp+0xC0]
    call .mask_edges_sec
    jmp .end_clip
; Secondary neighbours with edge masking.
; In: m26-m29 = edge masks, m30 = px_idx, t0 = dir, m12-m15 = lut, m4-m7 = px
.mask_edges_sec:
    vpaddd          m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd          m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb    k1, m26, m20
    vpshufbitqmb    k2, m27, m20
    vpshufbitqmb    k3, m28, m20
    vpshufbitqmb    k4, m29, m20
    mova            m16, m4
    mova            m17, m5
    mova            m18, m6
    mova            m19, m7
    vpermb          m16{k1}, m20, m12
    vpermb          m17{k2}, m20, m13
    vpermb          m18{k3}, m20, m14
    vpermb          m19{k4}, m20, m15
    vpshufbitqmb    k1, m26, m21
    vpshufbitqmb    k2, m27, m21
    vpshufbitqmb    k3, m28, m21
    vpshufbitqmb    k4, m29, m21
    vpermb          m12, m21, m12               ; the lut registers are consumed here
    vpermb          m13, m21, m13
    vpermb          m14, m21, m14
    vpermb          m15, m21, m15
    vpblendmb       m12{k1}, m4, m12            ; out-of-range neighbours become px
    vpblendmb       m13{k2}, m5, m13
    vpblendmb       m14{k3}, m6, m14
    vpblendmb       m15{k4}, m7, m15
    jmp .sec_main
ALIGN function_align
; Secondary neighbours without edge masking.
; In: m30 = px_idx, t0 = dir, m12-m15 = lut (overwritten with neighbours)
.sec:
    vpaddd          m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd          m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb          m16, m20, m12               ; pNtl k0s0 k0s1 k1s0 k1s1
    vpermb          m17, m20, m13               ; pNtr
    vpermb          m18, m20, m14               ; pNbl
    vpermb          m19, m20, m15               ; pNbr
    vpermb          m12, m21, m12               ; pNtl k0s2 k0s3 k1s2 k1s3
    vpermb          m13, m21, m13               ; pNtr
    vpermb          m14, m21, m14               ; pNbl
    vpermb          m15, m21, m15               ; pNbr
.sec_main:
; Accumulate one set of secondary taps into m0-m3.
; %1-%4: neighbour registers for tl/tr/bl/br.
; %5:    non-zero to (re)load the constants: m28 = sec strength, m29 = shift
;        matrix, m30 = sec_tap. This destroys t1d and the px_idx held in m30,
;        so it must be the first invocation. Defaults to 0 (reuse constants).
; Clobbers: m20-m27, k1-k4
%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
    vpcmpub         k1, m4, %1, 6               ; px > neighbour
    vpcmpub         k2, m5, %2, 6
    vpcmpub         k3, m6, %3, 6
    vpcmpub         k4, m7, %4, 6
    psubb           m20, %1, m4
    psubb           m21, %2, m5
    psubb           m22, %3, m6
    psubb           m23, %4, m7
%if %5
    vpbroadcastb    m28, t1d                    ; sec_strength in every byte
    lzcnt           t1d, t1d                    ; row index into the shift table at r3
    vpbroadcastq    m29, [r3+t1*8]
%endif
    vpsubb          m20{k1}, m4, %1             ; abs(diff)
    vpsubb          m21{k2}, m5, %2
    vpsubb          m22{k3}, m6, %3
    vpsubb          m23{k4}, m7, %4
    gf2p8affineqb   m24, m20, m29, 0            ; abs(diff) >> shift
    gf2p8affineqb   m25, m21, m29, 0
    gf2p8affineqb   m26, m22, m29, 0
    gf2p8affineqb   m27, m23, m29, 0
%if %5
    vpbroadcastd    m30, [base+sec_tap]
%endif
    psubusb         m24, m28, m24               ; imax(0, sec_strength - (abs(diff) >> shift))
    psubusb         m25, m28, m25
    psubusb         m26, m28, m26
    psubusb         m27, m28, m27
    pminub          m20, m24                    ; constrain: min(abs(diff), limit)
    pminub          m21, m25
    pminub          m22, m26
    pminub          m23, m27
    mova            m24, m30
    mova            m25, m30
    mova            m26, m30
    mova            m27, m30
    vpsubb          m24{k1}, m31, m30           ; apply_sign(sec_tap)
    vpsubb          m25{k2}, m31, m30
    vpsubb          m26{k3}, m31, m30
    vpsubb          m27{k4}, m31, m30
    vpdpbusd        m0, m20, m24                ; sum tl
    vpdpbusd        m1, m21, m25                ; sum tr
    vpdpbusd        m2, m22, m26                ; sum bl
    vpdpbusd        m3, m23, m27                ; sum br
%endmacro
    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
    ret

%endif ; ARCH_X86_64