; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Emits two tables from one list of weights: every weight scaled by 128
; (1-D table) and every weight paired with its complement to 256 (2-D table).
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
    %rep %0
        dw %1*128
        %rotate 1
    %endrep
const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
    %rep %0
        dw %1, 256-%1
        %rotate 1
    %endrep
%endmacro

SMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
               255, 197, 146, 105,  73,  50,  37,  32, \
               255, 225, 196, 170, 145, 123, 102,  84, \
                68,  54,  43,  33,  26,  20,  17,  16, \
               255, 240, 225, 210, 196, 182, 169, 157, \
               145, 133, 122, 111, 101,  92,  83,  74, \
                66,  59,  52,  45,  39,  34,  29,  25, \
                21,  17,  14,  12,  10,   9,   8,   8, \
               255, 248, 240, 233, 225, 218, 210, 203, \
               196, 189, 182, 176, 169, 163, 156, 150, \
               144, 138, 133, 127, 121, 116, 111, 106, \
               101,  96,  91,  86,  82,  77,  73,  69, \
                65,  61,  57,  54,  50,  47,  44,  41, \
                38,  35,  32,  29,  27,  25,  22,  20, \
                18,  16,  15,  13,  12,  10,   9,   8, \
                 7,   6,   6,   5,   5,   4,   4,   4

%if ARCH_X86_64

; NOTE: the order of the tables below is significant. pw_2, pw_4 and pw_16 are
; defined further down as byte offsets into z_filter_k and z2_ymul8, and
; ipred_dc_128_16bpc indexes from pw_512 into pw_2048.
ipred_hv_shuf: db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
               db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
filter_shuf1:  db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
filter_shuf2:  db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
filter_shuf3:  db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:    dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
               dw  8*64,  9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
pw_m1024:      times 2 dw -1024
pw_1to16:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
pw_16to1:      dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z2_ymul:       dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
z2_ymul8:      dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8
pb_90:         times 4 db 90
z2_y_shuf_h4:  dd  3,  7,  2,  6,  1,  5,  0,  4
z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_x_shuf:     db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_y_shuf:     db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
z2_y_shuf_us:  db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
z_filter_k:    dw  4,  4,  5,  5,  4,  4
               dw  8,  8,  6,  6,  4,  4
               dw  0,  0,  0,  0,  2,  2

; Small word constants that alias entries of the tables above instead of
; occupying their own storage (offsets are in bytes).
%define pw_2  (z_filter_k+32) ; the "2, 2" pair ending z_filter_k
%define pw_4  (z_filter_k+ 0) ; the "4, 4" pair starting z_filter_k
%define pw_16 (z2_ymul8  +20) ; the "16, 16" pair inside z2_ymul8

pw_1:    times 2 dw 1
pw_3:    times 2 dw 3
pw_62:   times 2 dw 62
pw_512:  times 2 dw 512
pw_2048: times 2 dw 2048 ; must directly follow pw_512 (see ipred_dc_128_16bpc)
pd_8:    dd 8

; JMP_TABLE func, isa, label...
; Emits one dword per label holding the distance from the table symbol to
; func_isa.label. The table symbol is biased by -2*4 bytes so that the first
; entry is reached with index 2, which is tzcnt(4) for the .w4/.h4 labels.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The "splat" tables are views onto the trailing .sN entries of the dc/cfl
; tables; those entries are emitted with a matching -10*4 / -8*4 bias below.
%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)

JMP_TABLE ipred_dc_16bpc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                        s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc,    avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h_16bpc,          avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_16bpc,      avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_filter_16bpc,     avx2, w4, w8, w16, w32
JMP_TABLE ipred_cfl_16bpc,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                        s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64

cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

INIT_YMM avx2

;------------------------------------------------------------------------------
; ipred_dc_top_16bpc(dst, stride, tl, w, h)
; Averages the w 16-bit entries that start at tl+2 and fills the block with
; the result. Shares code with the functions below: it tail-jumps into the
; .hN reduction of ipred_dc_left_16bpc (chosen by tzcnt(w)) with wq holding
; the address of the matching .sN store loop of ipred_dc_16bpc.
; On the jump: m0 = first 32 bytes of the edge, xm3 = 0,
;              xm4 = (w+1)>>1 (rounding term), xm5 = tzcnt(w) (shift count).
;------------------------------------------------------------------------------
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    movifnidn       hd, hm
    add             tlq, 2
    movd            xm4, wd
    pxor            xm3, xm3
    pavgw           xm4, xm3            ; (w + 0 + 1) >> 1
    tzcnt           wd, wd
    movd            xm5, wd
    movu            m0, [tlq]
    lea             r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd          r6, [r5+wq*4]
    add             r6, r5              ; r6 = reduction entry point
    add             r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    movsxd          wq, [r5+wq*4]
    add             wq, r5              ; wq = store loop entry point
    jmp             r6

;------------------------------------------------------------------------------
; ipred_dc_left_16bpc(dst, stride, tl, w, h)
; Averages the h 16-bit entries that end just below tl and fills the block
; with the result. The .hN labels form a fall-through reduction chain that is
; also entered from ipred_dc_top_16bpc; it ends by jumping to the store loop
; whose address is in wq.
;------------------------------------------------------------------------------
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov             hd, hm              ; 32-bit write zero-extends, so hq is usable below
    sub             tlq, hq
    movd            xm4, hd
    sub             tlq, hq             ; tlq -= 2*h bytes in total
    pxor            xm3, xm3
    pavgw           xm4, xm3            ; (h + 1) >> 1, rounding term
    tzcnt           r6d, hd
    movd            xm5, r6d            ; shift count
    movu            m0, [tlq]
    lea             r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd          r6, [r5+r6*4]
    add             r6, r5
    add             r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    tzcnt           wd, wd
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    jmp             r6
.h64:
    paddw           m0, [tlq+96]
    paddw           m0, [tlq+64]
.h32:
    paddw           m0, [tlq+32]
.h16:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
.h8:
    psrldq          xm1, xm0, 8
    paddw           xm0, xm1
.h4:
    punpcklwd       xm0, xm3            ; widen the four partial sums to dwords
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    paddd           xm0, xm4            ; + rounding term
    psrld           xm0, xm5
    lea             stride3q, [strideq*3] ; r5 is free again: table base no longer needed
    vpbroadcastw    m0, xm0
    mova            m1, m0              ; m0-m3 all hold the DC value for the .sN loops
    mova            m2, m0
    mova            m3, m0
    jmp             wq

;------------------------------------------------------------------------------
; ipred_dc_16bpc(dst, stride, tl, w, h)
; Averages the h entries below tl together with the w entries from tl+2 and
; fills the block with the result.
; Dispatch is two-stage: .hN loads/sums the entries below tl, then jumps via
; wq to .wN, which adds the entries from tl+2, finishes the average and falls
; into the .sN store loop. The .sN loops are also jump targets for the other
; DC variants and ipred_v_16bpc through ipred_dc_splat_16bpc_avx2_table.
;
; xm4 = (w+h)>>1 (rounding term), xm5 = tzcnt(w+h), m3 = 0.
; When w != h the shift by tzcnt(w+h) leaves an odd factor to divide out; this
; is done with pmulhuw by 0xAAAB or 0x6667 followed by a shift right by 1,
; i.e. x*c >> 17, approximating x/3 and x/5 respectively.
; NOTE(review): this assumes w+h is 3 or 5 times a power of two whenever
; w != h - confirm against the block sizes the callers can pass.
;------------------------------------------------------------------------------
cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn       hd, hm
    tzcnt           r6d, hd
    lea             r5d, [wq+hq]
    movd            xm4, r5d
    tzcnt           r5d, r5d
    movd            xm5, r5d
    lea             r5, [ipred_dc_16bpc_avx2_table]
    tzcnt           wd, wd
    movsxd          r6, [r5+r6*4]
    movsxd          wq, [r5+wq*4+5*4]   ; skip the five .hN entries
    pxor            m3, m3
    psrlw           xm4, 1
    add             r6, r5
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             r6
.h4:
    movq            xm0, [tlq-8]
    jmp             wq
.w4:
    movq            xm1, [tlq+2]
    paddw           m0, m4
    paddw           m0, m1
    psrlq           m1, m0, 32
    paddw           m0, m1
    psrld           m1, m0, 16
    paddw           m0, m1
    cmp             hd, 4
    jg              .w4_mul
    psrlw           xm0, 3              ; h == 4: w+h == 8
    jmp             .w4_end
.w4_mul:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
    lea             r2d, [hq*2]         ; tl (r2) is no longer needed; reuse as scratch
    mov             r6d, 0xAAAB6667
    shrx            r6d, r6d, r2d       ; shift count is taken mod 32: h=8 -> 0xAAAB, h=16 -> low word 0x6667
    punpckhwd       xm1, xm0, xm3
    punpcklwd       xm0, xm3
    paddd           xm0, xm1
    movd            xm1, r6d
    psrld           xm0, 2
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w4_end:
    vpbroadcastw    xm0, xm0
.s4:
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm0
    movq            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .s4
    RET
ALIGN function_align
.h8:
    mova            xm0, [tlq-16]
    jmp             wq
.w8:
    vextracti128    xm1, m0, 1
    paddw           xm0, [tlq+2]
    paddw           xm0, xm4
    paddw           xm0, xm1
    psrld           xm1, xm0, 16
    paddw           xm0, xm1
    pblendw         xm0, xm3, 0xAA      ; zero the odd words so the sums can continue as dwords
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 8
    je              .w8_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    cmp             hd, 32
    cmovz           r6d, r2d            ; h == 32 -> 0x6667, otherwise 0xAAAB
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w8_end:
    vpbroadcastw    xm0, xm0
.s8:
    mova            [dstq+strideq*0], xm0
    mova            [dstq+strideq*1], xm0
    mova            [dstq+strideq*2], xm0
    mova            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .s8
    RET
ALIGN function_align
.h16:
    mova            m0, [tlq-32]
    jmp             wq
.w16:
    paddw           m0, [tlq+2]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpckhwd       xm1, xm0, xm3
    punpcklwd       xm0, xm3
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 16
    je              .w16_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    test            hb, 8|32
    cmovz           r6d, r2d            ; neither bit set -> 0x6667, otherwise 0xAAAB
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w16_end:
    vpbroadcastw    m0, xm0
.s16:
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .s16
    RET
ALIGN function_align
.h32:
    mova            m0, [tlq-64]
    paddw           m0, [tlq-32]
    jmp             wq
.w32:
    paddw           m0, [tlq+ 2]
    paddw           m0, [tlq+34]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpcklwd       xm1, xm0, xm3
    punpckhwd       xm0, xm3
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 32
    je              .w32_end
    lea             r2d, [hq*2]
    mov             r6d, 0x6667AAAB
    shrx            r6d, r6d, r2d       ; shift count mod 32 selects which 16-bit half lands in the low word
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w32_end:
    vpbroadcastw    m0, xm0
    mova            m1, m0
.s32:
    mova            [dstq+strideq*0+32*0], m0
    mova            [dstq+strideq*0+32*1], m1
    mova            [dstq+strideq*1+32*0], m0
    mova            [dstq+strideq*1+32*1], m1
    mova            [dstq+strideq*2+32*0], m0
    mova            [dstq+strideq*2+32*1], m1
    mova            [dstq+stride3q +32*0], m0
    mova            [dstq+stride3q +32*1], m1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .s32
    RET
ALIGN function_align
.h64:
    mova            m0, [tlq-128]
    mova            m1, [tlq- 96]
    paddw           m0, [tlq- 64]
    paddw           m1, [tlq- 32]
    paddw           m0, m1
    jmp             wq
.w64:
    movu            m1, [tlq+ 2]
    paddw           m0, [tlq+34]
    paddw           m1, [tlq+66]
    paddw           m0, [tlq+98]
    paddw           m0, m1
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
    punpcklwd       xm1, xm0, xm3
    punpckhwd       xm0, xm3
    paddd           xm1, xm4            ; rounding term is added after widening to dwords here
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 64
    je              .w64_end
    mov             r6d, 0x6667AAAB
    shrx            r6d, r6d, hd        ; h=16 -> 0x6667, h=32 -> low word 0xAAAB
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w64_end:
    vpbroadcastw    m0, xm0
    mova            m1, m0
    mova            m2, m0
    mova            m3, m0
.s64:
    mova            [dstq+strideq*0+32*0], m0
    mova            [dstq+strideq*0+32*1], m1
    mova            [dstq+strideq*0+32*2], m2
    mova            [dstq+strideq*0+32*3], m3
    mova            [dstq+strideq*1+32*0], m0
    mova            [dstq+strideq*1+32*1], m1
    mova            [dstq+strideq*1+32*2], m2
    mova            [dstq+strideq*1+32*3], m3
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .s64
    RET

;------------------------------------------------------------------------------
; ipred_dc_128_16bpc(dst, stride, tl, w, h, ...)
; Fills the block with a constant: pw_512 when (r8m >> 11) == 0, otherwise the
; dword that follows it in rodata (pw_2048 for an index of 1).
; NOTE(review): r8m is presumably bitdepth_max (so 10-bit -> 512, 12-bit ->
; 2048) - confirm against the C prototype.
;------------------------------------------------------------------------------
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov             r6d, r8m
    shr             r6d, 11
    lea             r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt           wd, wd
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    vpbroadcastd    m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    mova            m1, m0
    mova            m2, m0
    mova            m3, m0
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

;------------------------------------------------------------------------------
; ipred_v_16bpc(dst, stride, tl, w, h)
; Copies the row starting at tl+2 into every row of the block by loading it
; into m0-m3 and reusing the .sN store loops of ipred_dc_16bpc.
; All four 32-byte vectors are loaded regardless of w.
; NOTE(review): this presumably relies on the edge buffer being readable up to
; tl+2+128 bytes even for narrow blocks - confirm with the caller.
;------------------------------------------------------------------------------
cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn       hd, hm
    movu            m0, [tlq+ 2]
    movu            m1, [tlq+34]
    movu            m2, [tlq+66]
    movu            m3, [tlq+98]
    lea             r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt           wd, wd
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

; One loop body of ipred_h_16bpc for a width that fits a single register:
; broadcasts the four 16-bit entries at tl-2, tl-4, tl-6 and tl-8 and stores
; each one to its own row, then steps tl back by 8 bytes and dst down 4 rows.
; Loops to .w%1 while rows remain.
%macro IPRED_H 2 ; w, store_type
    vpbroadcastw    m0, [tlq-2]
    vpbroadcastw    m1, [tlq-4]
    vpbroadcastw    m2, [tlq-6]
    vpbroadcastw    m3, [tlq-8]
    sub             tlq, 8
    mov%2           [dstq+strideq*0], m0
    mov%2           [dstq+strideq*1], m1
    mov%2           [dstq+strideq*2], m2
    mov%2           [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w%1
    RET
ALIGN function_align
%endmacro

;------------------------------------------------------------------------------
; ipred_h_16bpc(dst, stride, tl, w, h)
; Fills each row of the block with the 16-bit entry found at decreasing
; addresses below tl (row 0 uses tl-2, row 1 uses tl-4, ...).
; The register width is switched with INIT_XMM/INIT_YMM so that m0-m3 are
; 128-bit for .w4/.w8 and 256-bit from .w16 on.
;------------------------------------------------------------------------------
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    movifnidn       hd, hm
    lea             r5, [ipred_h_16bpc_avx2_table]
    tzcnt           wd, wd
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq
INIT_XMM avx2
.w4:
    IPRED_H         4, q
.w8:
    IPRED_H         8, a
INIT_YMM avx2
.w16:
    IPRED_H         16, a
.w32:
    vpbroadcastw    m0, [tlq-2]
    vpbroadcastw    m1, [tlq-4]
    vpbroadcastw    m2, [tlq-6]
    vpbroadcastw    m3, [tlq-8]
    sub             tlq, 8
    mova            [dstq+strideq*0+32*0], m0
    mova            [dstq+strideq*0+32*1], m0
    mova            [dstq+strideq*1+32*0], m1
    mova            [dstq+strideq*1+32*1], m1
    mova            [dstq+strideq*2+32*0], m2
    mova            [dstq+strideq*2+32*1], m2
    mova            [dstq+stride3q +32*0], m3
    mova            [dstq+stride3q +32*1], m3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w32
    RET
.w64:
    vpbroadcastw    m0, [tlq-2]
    vpbroadcastw    m1, [tlq-4]
    sub             tlq, 4
    mova            [dstq+strideq*0+32*0], m0
    mova            [dstq+strideq*0+32*1], m0
    mova            [dstq+strideq*0+32*2], m0
    mova            [dstq+strideq*0+32*3], m0
    mova            [dstq+strideq*1+32*0], m1
    mova            [dstq+strideq*1+32*1], m1
    mova            [dstq+strideq*1+32*2], m1
    mova            [dstq+strideq*1+32*3], m1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w64
    RET

; Paeth selection for one vector of pixels.
; In:  m1 = left, m3 = topleft, m%1 = top,
;      m%2 = top - topleft (signed), m%3 = |top - topleft|
; Out: m0 = per-word choice among left, top and topleft
; Clobbers m7.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw           m0, m%2, m1
    psubw           m7, m3, m0          ; tldiff
    psubw           m0, m%1             ; tdiff
    pabsw           m7, m7
    pabsw           m0, m0
    pminsw          m7, m0
    pcmpeqw         m0, m7
    pcmpgtw         m7, m%3, m7
    vpblendvb       m0, m3, m%1, m0
    vpblendvb       m0, m1, m0, m7
%endmacro

;------------------------------------------------------------------------------
; ipred_paeth_16bpc(dst, stride, tl, w, h)
; Paeth prediction; dispatches on tzcnt(w) to a per-width loop.
; m3 = topleft broadcast to all words.
;------------------------------------------------------------------------------
cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
%define base r5-ipred_paeth_16bpc_avx2_table
    movifnidn       hd, hm
    lea             r5, [ipred_paeth_16bpc_avx2_table]
    tzcnt           wd, wd
    movsxd          wq, [r5+wq*4]
    vpbroadcastw    m3, [tlq]           ; topleft
    add             wq, r5
    jmp             wq
.w4:
    vpbroadcastq    m2, [tlq+2]         ; top
    movsldup        m6, [base+ipred_hv_shuf]
    lea             r3, [strideq*3]
    psubw           m4, m2, m3
    pabsw           m5, m4
.w4_loop:
    sub             tlq, 8
    vpbroadcastq    m1, [tlq]
    pshufb          m1, m6              ; left
    PAETH           2, 4, 5
    vextracti128    xm1, m0, 1
Build Coastguard Worker movq [dstq+strideq*0], xm0 554*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 555*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 556*c0909341SAndroid Build Coastguard Worker movhps [dstq+r3 ], xm1 557*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 558*c0909341SAndroid Build Coastguard Worker sub hd, 4 559*c0909341SAndroid Build Coastguard Worker jg .w4_loop 560*c0909341SAndroid Build Coastguard Worker RET 561*c0909341SAndroid Build Coastguard WorkerALIGN function_align 562*c0909341SAndroid Build Coastguard Worker.w8: 563*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [tlq+2] 564*c0909341SAndroid Build Coastguard Worker movsldup m6, [base+ipred_hv_shuf] 565*c0909341SAndroid Build Coastguard Worker psubw m4, m2, m3 566*c0909341SAndroid Build Coastguard Worker pabsw m5, m4 567*c0909341SAndroid Build Coastguard Worker.w8_loop: 568*c0909341SAndroid Build Coastguard Worker sub tlq, 4 569*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [tlq] 570*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 571*c0909341SAndroid Build Coastguard Worker PAETH 2, 4, 5 572*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 573*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 574*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 575*c0909341SAndroid Build Coastguard Worker sub hd, 2 576*c0909341SAndroid Build Coastguard Worker jg .w8_loop 577*c0909341SAndroid Build Coastguard Worker RET 578*c0909341SAndroid Build Coastguard WorkerALIGN function_align 579*c0909341SAndroid Build Coastguard Worker.w16: 580*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+2] 581*c0909341SAndroid Build Coastguard Worker psubw m4, m2, m3 582*c0909341SAndroid Build Coastguard Worker pabsw m5, m4 583*c0909341SAndroid Build Coastguard Worker.w16_loop: 584*c0909341SAndroid Build Coastguard Worker sub tlq, 2 585*c0909341SAndroid 
Build Coastguard Worker vpbroadcastw m1, [tlq] 586*c0909341SAndroid Build Coastguard Worker PAETH 2, 4, 5 587*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 588*c0909341SAndroid Build Coastguard Worker add dstq, strideq 589*c0909341SAndroid Build Coastguard Worker dec hd 590*c0909341SAndroid Build Coastguard Worker jg .w16_loop 591*c0909341SAndroid Build Coastguard Worker RET 592*c0909341SAndroid Build Coastguard WorkerALIGN function_align 593*c0909341SAndroid Build Coastguard Worker.w32: 594*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+2] 595*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+34] 596*c0909341SAndroid Build Coastguard Worker%if WIN64 597*c0909341SAndroid Build Coastguard Worker movaps r4m, xmm8 598*c0909341SAndroid Build Coastguard Worker movaps r6m, xmm9 599*c0909341SAndroid Build Coastguard Worker%endif 600*c0909341SAndroid Build Coastguard Worker psubw m4, m2, m3 601*c0909341SAndroid Build Coastguard Worker psubw m8, m6, m3 602*c0909341SAndroid Build Coastguard Worker pabsw m5, m4 603*c0909341SAndroid Build Coastguard Worker pabsw m9, m8 604*c0909341SAndroid Build Coastguard Worker.w32_loop: 605*c0909341SAndroid Build Coastguard Worker sub tlq, 2 606*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [tlq] 607*c0909341SAndroid Build Coastguard Worker PAETH 2, 4, 5 608*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 609*c0909341SAndroid Build Coastguard Worker PAETH 6, 8, 9 610*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 611*c0909341SAndroid Build Coastguard Worker add dstq, strideq 612*c0909341SAndroid Build Coastguard Worker dec hd 613*c0909341SAndroid Build Coastguard Worker jg .w32_loop 614*c0909341SAndroid Build Coastguard Worker%if WIN64 615*c0909341SAndroid Build Coastguard Worker movaps xmm8, r4m 616*c0909341SAndroid Build Coastguard Worker movaps xmm9, r6m 617*c0909341SAndroid Build Coastguard Worker%endif 618*c0909341SAndroid Build Coastguard Worker RET 619*c0909341SAndroid 
Build Coastguard WorkerALIGN function_align 620*c0909341SAndroid Build Coastguard Worker.w64: 621*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 16 622*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+ 2] 623*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+34] 624*c0909341SAndroid Build Coastguard Worker movu m10, [tlq+66] 625*c0909341SAndroid Build Coastguard Worker movu m13, [tlq+98] 626*c0909341SAndroid Build Coastguard Worker psubw m4, m2, m3 627*c0909341SAndroid Build Coastguard Worker psubw m8, m6, m3 628*c0909341SAndroid Build Coastguard Worker psubw m11, m10, m3 629*c0909341SAndroid Build Coastguard Worker psubw m14, m13, m3 630*c0909341SAndroid Build Coastguard Worker pabsw m5, m4 631*c0909341SAndroid Build Coastguard Worker pabsw m9, m8 632*c0909341SAndroid Build Coastguard Worker pabsw m12, m11 633*c0909341SAndroid Build Coastguard Worker pabsw m15, m14 634*c0909341SAndroid Build Coastguard Worker.w64_loop: 635*c0909341SAndroid Build Coastguard Worker sub tlq, 2 636*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [tlq] 637*c0909341SAndroid Build Coastguard Worker PAETH 2, 4, 5 638*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 639*c0909341SAndroid Build Coastguard Worker PAETH 6, 8, 9 640*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 641*c0909341SAndroid Build Coastguard Worker PAETH 10, 11, 12 642*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 643*c0909341SAndroid Build Coastguard Worker PAETH 13, 14, 15 644*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m0 645*c0909341SAndroid Build Coastguard Worker add dstq, strideq 646*c0909341SAndroid Build Coastguard Worker dec hd 647*c0909341SAndroid Build Coastguard Worker jg .w64_loop 648*c0909341SAndroid Build Coastguard Worker RET 649*c0909341SAndroid Build Coastguard Worker 650*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights 651*c0909341SAndroid Build 
Coastguard Worker%define base r6-ipred_smooth_v_16bpc_avx2_table 652*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_smooth_v_16bpc_avx2_table] 653*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 654*c0909341SAndroid Build Coastguard Worker mov hd, hm 655*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 656*c0909341SAndroid Build Coastguard Worker lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] 657*c0909341SAndroid Build Coastguard Worker neg hq 658*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, [tlq+hq*2] ; bottom 659*c0909341SAndroid Build Coastguard Worker add wq, r6 660*c0909341SAndroid Build Coastguard Worker jmp wq 661*c0909341SAndroid Build Coastguard Worker.w4: 662*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [tlq+2] ; top 663*c0909341SAndroid Build Coastguard Worker movsldup m3, [base+ipred_hv_shuf] 664*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 665*c0909341SAndroid Build Coastguard Worker psubw m4, m5 ; top - bottom 666*c0909341SAndroid Build Coastguard Worker.w4_loop: 667*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [weightsq+hq*2] 668*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 669*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 670*c0909341SAndroid Build Coastguard Worker paddw m0, m5 671*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 672*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm1 673*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 674*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 675*c0909341SAndroid Build Coastguard Worker movq [dstq+r6 ], xm0 676*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 677*c0909341SAndroid Build Coastguard Worker add hq, 4 678*c0909341SAndroid Build Coastguard Worker jl .w4_loop 679*c0909341SAndroid Build Coastguard Worker.ret: 680*c0909341SAndroid Build Coastguard Worker RET 681*c0909341SAndroid Build Coastguard 
Worker.w8: 682*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [tlq+2] 683*c0909341SAndroid Build Coastguard Worker movsldup m3, [base+ipred_hv_shuf] 684*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 685*c0909341SAndroid Build Coastguard Worker psubw m4, m5 686*c0909341SAndroid Build Coastguard Worker.w8_loop: 687*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [weightsq+hq*2+0] 688*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [weightsq+hq*2+4] 689*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 690*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 691*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 692*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 693*c0909341SAndroid Build Coastguard Worker paddw m0, m5 694*c0909341SAndroid Build Coastguard Worker paddw m1, m5 695*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*0], m0, 1 696*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm0 697*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*2], m1, 1 698*c0909341SAndroid Build Coastguard Worker mova [dstq+r6 ], xm1 699*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 700*c0909341SAndroid Build Coastguard Worker add hq, 4 701*c0909341SAndroid Build Coastguard Worker jl .w8_loop 702*c0909341SAndroid Build Coastguard Worker RET 703*c0909341SAndroid Build Coastguard Worker.w16: 704*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+2] 705*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 706*c0909341SAndroid Build Coastguard Worker psubw m4, m5 707*c0909341SAndroid Build Coastguard Worker.w16_loop: 708*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [weightsq+hq*2+0] 709*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [weightsq+hq*2+2] 710*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, [weightsq+hq*2+4] 711*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [weightsq+hq*2+6] 
712*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m4}, m0, m1, m2, m3 713*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 714*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 715*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 716*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m2 717*c0909341SAndroid Build Coastguard Worker mova [dstq+r6 ], m3 718*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 719*c0909341SAndroid Build Coastguard Worker add hq, 4 720*c0909341SAndroid Build Coastguard Worker jl .w16_loop 721*c0909341SAndroid Build Coastguard Worker RET 722*c0909341SAndroid Build Coastguard Worker.w32: 723*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 724*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+ 2] 725*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+34] 726*c0909341SAndroid Build Coastguard Worker psubw m4, m5 727*c0909341SAndroid Build Coastguard Worker psubw m6, m5 728*c0909341SAndroid Build Coastguard Worker.w32_loop: 729*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [weightsq+hq*2+0] 730*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [weightsq+hq*2+2] 731*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4, m1 732*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 733*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4, m3 734*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 735*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 736*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 737*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*1], m1 738*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m2 739*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*1], m3 740*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 741*c0909341SAndroid Build Coastguard Worker 
add hq, 2 742*c0909341SAndroid Build Coastguard Worker jl .w32_loop 743*c0909341SAndroid Build Coastguard Worker RET 744*c0909341SAndroid Build Coastguard Worker.w64: 745*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 746*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+ 2] 747*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+34] 748*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+66] 749*c0909341SAndroid Build Coastguard Worker movu m7, [tlq+98] 750*c0909341SAndroid Build Coastguard Worker REPX {psubw x, m5}, m3, m4, m6, m7 751*c0909341SAndroid Build Coastguard Worker.w64_loop: 752*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, [weightsq+hq*2] 753*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3, m2 754*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4, m2 755*c0909341SAndroid Build Coastguard Worker paddw m0, m5 756*c0909341SAndroid Build Coastguard Worker paddw m1, m5 757*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 758*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m6, m2 759*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 760*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7, m2 761*c0909341SAndroid Build Coastguard Worker paddw m0, m5 762*c0909341SAndroid Build Coastguard Worker paddw m1, m5 763*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 764*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m1 765*c0909341SAndroid Build Coastguard Worker add dstq, strideq 766*c0909341SAndroid Build Coastguard Worker inc hq 767*c0909341SAndroid Build Coastguard Worker jl .w64_loop 768*c0909341SAndroid Build Coastguard Worker RET 769*c0909341SAndroid Build Coastguard Worker 770*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 771*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_h_16bpc_avx2_table 772*c0909341SAndroid Build Coastguard Worker lea r6, 
[ipred_smooth_h_16bpc_avx2_table] 773*c0909341SAndroid Build Coastguard Worker mov wd, wm 774*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 775*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, [tlq+wq*2] ; right 776*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 777*c0909341SAndroid Build Coastguard Worker add hd, hd 778*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 779*c0909341SAndroid Build Coastguard Worker sub tlq, hq 780*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 781*c0909341SAndroid Build Coastguard Worker add wq, r6 782*c0909341SAndroid Build Coastguard Worker jmp wq 783*c0909341SAndroid Build Coastguard Worker.w4: 784*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] 785*c0909341SAndroid Build Coastguard Worker movsldup m3, [base+ipred_hv_shuf] 786*c0909341SAndroid Build Coastguard Worker.w4_loop: 787*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [tlq+hq-8] ; left 788*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 789*c0909341SAndroid Build Coastguard Worker psubw m0, m5 ; left - right 790*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 791*c0909341SAndroid Build Coastguard Worker paddw m0, m5 792*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 793*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 794*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 795*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 796*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 797*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 798*c0909341SAndroid Build Coastguard Worker sub hd, 4*2 799*c0909341SAndroid Build Coastguard Worker jg .w4_loop 800*c0909341SAndroid Build Coastguard Worker RET 801*c0909341SAndroid Build Coastguard Worker.w8: 802*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, 
[base+smooth_weights_1d_16bpc+8*2] 803*c0909341SAndroid Build Coastguard Worker movsldup m3, [base+ipred_hv_shuf] 804*c0909341SAndroid Build Coastguard Worker.w8_loop: 805*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [tlq+hq-4] 806*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [tlq+hq-8] 807*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 808*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 809*c0909341SAndroid Build Coastguard Worker psubw m0, m5 810*c0909341SAndroid Build Coastguard Worker psubw m1, m5 811*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 812*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 813*c0909341SAndroid Build Coastguard Worker paddw m0, m5 814*c0909341SAndroid Build Coastguard Worker paddw m1, m5 815*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 816*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 817*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 818*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m1, 1 819*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 820*c0909341SAndroid Build Coastguard Worker sub hq, 4*2 821*c0909341SAndroid Build Coastguard Worker jg .w8_loop 822*c0909341SAndroid Build Coastguard Worker RET 823*c0909341SAndroid Build Coastguard Worker.w16: 824*c0909341SAndroid Build Coastguard Worker movu m4, [base+smooth_weights_1d_16bpc+16*2] 825*c0909341SAndroid Build Coastguard Worker.w16_loop: 826*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [tlq+hq-8] 827*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 828*c0909341SAndroid Build Coastguard Worker psubw m3, m5 829*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q3333 830*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q2222 831*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q1111 832*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 
833*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m4}, m0, m1, m2, m3 834*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 835*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 836*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 837*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m2 838*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q ], m3 839*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 840*c0909341SAndroid Build Coastguard Worker sub hq, 4*2 841*c0909341SAndroid Build Coastguard Worker jg .w16_loop 842*c0909341SAndroid Build Coastguard Worker RET 843*c0909341SAndroid Build Coastguard Worker.w32: 844*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 845*c0909341SAndroid Build Coastguard Worker movu m4, [base+smooth_weights_1d_16bpc+32*2] 846*c0909341SAndroid Build Coastguard Worker movu m6, [base+smooth_weights_1d_16bpc+32*3] 847*c0909341SAndroid Build Coastguard Worker.w32_loop: 848*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [tlq+hq-2] 849*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [tlq+hq-4] 850*c0909341SAndroid Build Coastguard Worker psubw m1, m5 851*c0909341SAndroid Build Coastguard Worker psubw m3, m5 852*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4, m1 853*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 854*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4, m3 855*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 856*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 857*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 858*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*1], m1 859*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m2 860*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*1], m3 861*c0909341SAndroid Build Coastguard Worker lea dstq, 
[dstq+strideq*2] 862*c0909341SAndroid Build Coastguard Worker sub hq, 2*2 863*c0909341SAndroid Build Coastguard Worker jg .w32_loop 864*c0909341SAndroid Build Coastguard Worker RET 865*c0909341SAndroid Build Coastguard Worker.w64: 866*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 867*c0909341SAndroid Build Coastguard Worker movu m3, [base+smooth_weights_1d_16bpc+32*4] 868*c0909341SAndroid Build Coastguard Worker movu m4, [base+smooth_weights_1d_16bpc+32*5] 869*c0909341SAndroid Build Coastguard Worker movu m6, [base+smooth_weights_1d_16bpc+32*6] 870*c0909341SAndroid Build Coastguard Worker movu m7, [base+smooth_weights_1d_16bpc+32*7] 871*c0909341SAndroid Build Coastguard Worker.w64_loop: 872*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, [tlq+hq-2] 873*c0909341SAndroid Build Coastguard Worker psubw m2, m5 874*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3, m2 875*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4, m2 876*c0909341SAndroid Build Coastguard Worker paddw m0, m5 877*c0909341SAndroid Build Coastguard Worker paddw m1, m5 878*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 879*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m6, m2 880*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 881*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7, m2 882*c0909341SAndroid Build Coastguard Worker paddw m0, m5 883*c0909341SAndroid Build Coastguard Worker paddw m1, m5 884*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 885*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m1 886*c0909341SAndroid Build Coastguard Worker add dstq, strideq 887*c0909341SAndroid Build Coastguard Worker sub hq, 1*2 888*c0909341SAndroid Build Coastguard Worker jg .w64_loop 889*c0909341SAndroid Build Coastguard Worker RET 890*c0909341SAndroid Build Coastguard Worker 891*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] 892*c0909341SAndroid Build 
Coastguard Worker pmaddwd m0, m%1, m%3 893*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m%2, m%4 894*c0909341SAndroid Build Coastguard Worker paddd m0, m%5 895*c0909341SAndroid Build Coastguard Worker paddd m1, m%6 896*c0909341SAndroid Build Coastguard Worker psrld m0, 8 897*c0909341SAndroid Build Coastguard Worker psrld m1, 8 898*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 899*c0909341SAndroid Build Coastguard Worker pavgw m0, m5 900*c0909341SAndroid Build Coastguard Worker%endmacro 901*c0909341SAndroid Build Coastguard Worker 902*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights 903*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_16bpc_avx2_table 904*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_smooth_16bpc_avx2_table] 905*c0909341SAndroid Build Coastguard Worker mov wd, wm 906*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, [tlq+wq*2] ; right 907*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 908*c0909341SAndroid Build Coastguard Worker mov hd, hm 909*c0909341SAndroid Build Coastguard Worker sub tlq, hq 910*c0909341SAndroid Build Coastguard Worker sub tlq, hq 911*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 912*c0909341SAndroid Build Coastguard Worker pxor m5, m5 913*c0909341SAndroid Build Coastguard Worker add wq, r6 914*c0909341SAndroid Build Coastguard Worker lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] 915*c0909341SAndroid Build Coastguard Worker jmp wq 916*c0909341SAndroid Build Coastguard Worker.w4: 917*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 11 918*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [tlq] ; bottom 919*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [tlq+hq*2+2] 920*c0909341SAndroid Build Coastguard Worker movsldup m7, [base+ipred_hv_shuf] 921*c0909341SAndroid Build Coastguard Worker movshdup m9, [base+ipred_hv_shuf] 922*c0909341SAndroid Build Coastguard 
Worker vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] 923*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m0 ; top, bottom 924*c0909341SAndroid Build Coastguard Worker punpcklqdq m8, m9, m9 925*c0909341SAndroid Build Coastguard Worker punpckhqdq m9, m9 926*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 927*c0909341SAndroid Build Coastguard Worker.w4_loop: 928*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [tlq+hq*2-8] 929*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [v_weightsq] 930*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 931*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m4 ; left, right 932*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 933*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m10 934*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10 935*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, m8 936*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 937*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 0, 1, 6, 6, 2, 3 938*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 939*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 940*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 941*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 942*c0909341SAndroid Build Coastguard Worker movhps [dstq+r3 ], xm1 943*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 944*c0909341SAndroid Build Coastguard Worker add v_weightsq, 16 945*c0909341SAndroid Build Coastguard Worker sub hd, 4 946*c0909341SAndroid Build Coastguard Worker jg .w4_loop 947*c0909341SAndroid Build Coastguard Worker RET 948*c0909341SAndroid Build Coastguard Worker.w8: 949*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12 950*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [tlq] ; bottom 951*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [tlq+hq*2+2] 952*c0909341SAndroid 
Build Coastguard Worker movsldup m8, [base+ipred_hv_shuf] 953*c0909341SAndroid Build Coastguard Worker movshdup m9, [base+ipred_hv_shuf] 954*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] 955*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] 956*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7, m0 ; top, bottom 957*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m0 958*c0909341SAndroid Build Coastguard Worker.w8_loop: 959*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [tlq+hq*2-4] 960*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [v_weightsq] 961*c0909341SAndroid Build Coastguard Worker pshufb m3, m8 962*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m4 ; left, right 963*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 964*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m10 965*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m11 966*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 967*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 1, 1, 6, 7, 2, 3 968*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 969*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 970*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 971*c0909341SAndroid Build Coastguard Worker add v_weightsq, 8 972*c0909341SAndroid Build Coastguard Worker sub hd, 2 973*c0909341SAndroid Build Coastguard Worker jg .w8_loop 974*c0909341SAndroid Build Coastguard Worker RET 975*c0909341SAndroid Build Coastguard Worker.w16: 976*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 11 977*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [tlq] ; bottom 978*c0909341SAndroid Build Coastguard Worker movu m7, [tlq+hq*2+2] 979*c0909341SAndroid Build Coastguard Worker mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] 980*c0909341SAndroid Build Coastguard Worker 
mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] 981*c0909341SAndroid Build Coastguard Worker vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 982*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 983*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7, m0 ; top, bottom 984*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m0 985*c0909341SAndroid Build Coastguard Worker.w16_loop: 986*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [tlq+hq*2-4] 987*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [v_weightsq+0] 988*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 ; left, right 989*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q1111 990*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m8, m2 991*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m9 992*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 993*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 1, 1, 6, 7, 10, 2 994*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [v_weightsq+4] 995*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8, m3 996*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9 997*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 998*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 1, 1, 6, 7, 2, 3 999*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 1000*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1001*c0909341SAndroid Build Coastguard Worker add v_weightsq, 8 1002*c0909341SAndroid Build Coastguard Worker sub hq, 2 1003*c0909341SAndroid Build Coastguard Worker jg .w16_loop 1004*c0909341SAndroid Build Coastguard Worker RET 1005*c0909341SAndroid Build Coastguard Worker.w32: 1006*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 15 1007*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [tlq] ; bottom 1008*c0909341SAndroid Build Coastguard Worker movu m7, 
[tlq+hq*2+ 2] 1009*c0909341SAndroid Build Coastguard Worker movu m9, [tlq+hq*2+34] 1010*c0909341SAndroid Build Coastguard Worker mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] 1011*c0909341SAndroid Build Coastguard Worker mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] 1012*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1 1013*c0909341SAndroid Build Coastguard Worker vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 1014*c0909341SAndroid Build Coastguard Worker mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] 1015*c0909341SAndroid Build Coastguard Worker mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] 1016*c0909341SAndroid Build Coastguard Worker vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 1017*c0909341SAndroid Build Coastguard Worker vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 1018*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7, m0 1019*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m0 1020*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m9, m0 1021*c0909341SAndroid Build Coastguard Worker punpckhwd m9, m0 1022*c0909341SAndroid Build Coastguard Worker.w32_loop: 1023*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [tlq+hq*2-2] 1024*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [v_weightsq] 1025*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 1026*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10, m3 1027*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m11, m3 1028*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6, m14 1029*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1030*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7, m14 1031*c0909341SAndroid Build Coastguard Worker paddd m1, m2 1032*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m12, m3 1033*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 1034*c0909341SAndroid Build Coastguard Worker psrld 
m0, 8 1035*c0909341SAndroid Build Coastguard Worker psrld m1, 8 1036*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 1037*c0909341SAndroid Build Coastguard Worker pavgw m0, m5 1038*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 1039*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 14, 14, 8, 9, 2, 3 1040*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 1041*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1042*c0909341SAndroid Build Coastguard Worker add v_weightsq, 4 1043*c0909341SAndroid Build Coastguard Worker dec hd 1044*c0909341SAndroid Build Coastguard Worker jg .w32_loop 1045*c0909341SAndroid Build Coastguard Worker RET 1046*c0909341SAndroid Build Coastguard Worker.w64: 1047*c0909341SAndroid Build Coastguard Worker PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base 1048*c0909341SAndroid Build Coastguard Worker mov dst_baseq, dstq 1049*c0909341SAndroid Build Coastguard Worker mov tl_baseq, tlq 1050*c0909341SAndroid Build Coastguard Worker mov v_weights_baseq, v_weightsq 1051*c0909341SAndroid Build Coastguard Worker xor xq, xq 1052*c0909341SAndroid Build Coastguard Worker.w64_loop_x: 1053*c0909341SAndroid Build Coastguard Worker mov yq, hq 1054*c0909341SAndroid Build Coastguard Worker lea tlq, [tl_baseq+hq*2] 1055*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [tl_baseq] ; bottom 1056*c0909341SAndroid Build Coastguard Worker movu m7, [tlq+xq*2+ 2] 1057*c0909341SAndroid Build Coastguard Worker movu m9, [tlq+xq*2+34] 1058*c0909341SAndroid Build Coastguard Worker mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] 1059*c0909341SAndroid Build Coastguard Worker mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] 1060*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 1061*c0909341SAndroid Build Coastguard Worker vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 1062*c0909341SAndroid Build 
Coastguard Worker mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] 1063*c0909341SAndroid Build Coastguard Worker mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] 1064*c0909341SAndroid Build Coastguard Worker vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 1065*c0909341SAndroid Build Coastguard Worker vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 1066*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7, m0 1067*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m0 1068*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m9, m0 1069*c0909341SAndroid Build Coastguard Worker punpckhwd m9, m0 1070*c0909341SAndroid Build Coastguard Worker lea tlq, [tl_baseq-2] 1071*c0909341SAndroid Build Coastguard Worker.w64_loop_y: 1072*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [tlq+yq*2] 1073*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [v_weightsq] 1074*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 1075*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m10, m3 1076*c0909341SAndroid Build Coastguard Worker pmaddwd m15, m11, m3 1077*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m12, m3 1078*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 1079*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6, m1 1080*c0909341SAndroid Build Coastguard Worker paddd m0, m14 1081*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m7, m1 1082*c0909341SAndroid Build Coastguard Worker paddd m14, m15 1083*c0909341SAndroid Build Coastguard Worker psrld m0, 8 1084*c0909341SAndroid Build Coastguard Worker psrld m14, 8 1085*c0909341SAndroid Build Coastguard Worker packssdw m0, m14 1086*c0909341SAndroid Build Coastguard Worker pavgw m0, m5 1087*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 1088*c0909341SAndroid Build Coastguard Worker SMOOTH_2D_END 8, 9, 1, 1, 2, 3 1089*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 1090*c0909341SAndroid Build Coastguard Worker add 
dstq, strideq 1091*c0909341SAndroid Build Coastguard Worker add v_weightsq, 4 1092*c0909341SAndroid Build Coastguard Worker dec yq 1093*c0909341SAndroid Build Coastguard Worker jg .w64_loop_y 1094*c0909341SAndroid Build Coastguard Worker lea dstq, [dst_baseq+32*2] 1095*c0909341SAndroid Build Coastguard Worker add r6, 16*8 1096*c0909341SAndroid Build Coastguard Worker mov v_weightsq, v_weights_baseq 1097*c0909341SAndroid Build Coastguard Worker add xq, 32 1098*c0909341SAndroid Build Coastguard Worker test xb, 64 1099*c0909341SAndroid Build Coastguard Worker jz .w64_loop_x 1100*c0909341SAndroid Build Coastguard Worker RET 1101*c0909341SAndroid Build Coastguard Worker 1102*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase 1103*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_z1_16bpc_avx2_table] 1104*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 1105*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 1106*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1107*c0909341SAndroid Build Coastguard Worker lea r7, [dr_intra_derivative] 1108*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 1109*c0909341SAndroid Build Coastguard Worker add tlq, 2 1110*c0909341SAndroid Build Coastguard Worker add wq, r6 1111*c0909341SAndroid Build Coastguard Worker mov dxd, angled 1112*c0909341SAndroid Build Coastguard Worker and dxd, 0x7e 1113*c0909341SAndroid Build Coastguard Worker add angled, 165 ; ~90 1114*c0909341SAndroid Build Coastguard Worker movzx dxd, word [r7+dxq] 1115*c0909341SAndroid Build Coastguard Worker xor angled, 0x4ff ; d = 90 - angle 1116*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_62] 1117*c0909341SAndroid Build Coastguard Worker jmp wq 1118*c0909341SAndroid Build Coastguard Worker.w4: 1119*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -64, 7 1120*c0909341SAndroid Build Coastguard Worker cmp angleb, 40 1121*c0909341SAndroid Build 
Coastguard Worker jae .w4_no_upsample 1122*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq-1024] 1123*c0909341SAndroid Build Coastguard Worker sar r3d, 7 1124*c0909341SAndroid Build Coastguard Worker add r3d, hd 1125*c0909341SAndroid Build Coastguard Worker jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) 1126*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm3, [tlq+14] 1127*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 1128*c0909341SAndroid Build Coastguard Worker palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 1129*c0909341SAndroid Build Coastguard Worker paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 1130*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1131*c0909341SAndroid Build Coastguard Worker palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 1132*c0909341SAndroid Build Coastguard Worker paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d 1133*c0909341SAndroid Build Coastguard Worker psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 1134*c0909341SAndroid Build Coastguard Worker psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 1135*c0909341SAndroid Build Coastguard Worker pxor xm4, xm4 1136*c0909341SAndroid Build Coastguard Worker paddw xm2, xm0 1137*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm0, r8m ; pixel_max 1138*c0909341SAndroid Build Coastguard Worker mova [rsp+32], xm3 1139*c0909341SAndroid Build Coastguard Worker movd xm3, dxd 1140*c0909341SAndroid Build Coastguard Worker pmaxsw xm2, xm4 1141*c0909341SAndroid Build Coastguard Worker mov r3d, dxd 1142*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm4 1143*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 1144*c0909341SAndroid Build Coastguard Worker pminsw xm2, xm0 1145*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm1, xm2 1146*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm2 1147*c0909341SAndroid Build Coastguard Worker lea r5, [strideq*3] 1148*c0909341SAndroid Build 
Coastguard Worker pslldq m2, m3, 8 1149*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0], xm0 1150*c0909341SAndroid Build Coastguard Worker mova [rsp+16], xm1 1151*c0909341SAndroid Build Coastguard Worker paddw m6, m3, m3 1152*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1153*c0909341SAndroid Build Coastguard Worker vpblendd m4, m6, 0xf0 1154*c0909341SAndroid Build Coastguard Worker paddw m6, m6 1155*c0909341SAndroid Build Coastguard Worker paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 1156*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [z_upsample] 1157*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop: 1158*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 1159*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base0 1160*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r3*2] 1161*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 1162*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base1 1163*c0909341SAndroid Build Coastguard Worker movu xm2, [rsp+r2*2] 1164*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 1165*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base2 1166*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r3*2], 1 ; 0 2 1167*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 1168*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base3 1169*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [rsp+r2*2], 1 ; 1 3 1170*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1171*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1172*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m2 1173*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m2 1174*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 ; frac 1175*c0909341SAndroid Build Coastguard Worker psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 1176*c0909341SAndroid Build Coastguard Worker psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) 1177*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) 1178*c0909341SAndroid Build Coastguard Worker paddw m3, m6 ; xpos += dx 1179*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1180*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1181*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 1182*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 1183*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 1184*c0909341SAndroid Build Coastguard Worker movhps [dstq+r5 ], xm1 1185*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1186*c0909341SAndroid Build Coastguard Worker sub hd, 4 1187*c0909341SAndroid Build Coastguard Worker jg .w4_upsample_loop 1188*c0909341SAndroid Build Coastguard Worker RET 1189*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1190*c0909341SAndroid Build Coastguard Worker.filter_strength: ; w4/w8/w16 1191*c0909341SAndroid Build Coastguard Worker%define base r3-z_filter_t0 1192*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1193*c0909341SAndroid Build Coastguard Worker lea r3, [z_filter_t0] 1194*c0909341SAndroid Build Coastguard Worker movd xm1, angled 1195*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1196*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, xm0 1197*c0909341SAndroid Build Coastguard Worker vpbroadcastb m1, xm1 1198*c0909341SAndroid Build Coastguard Worker pcmpeqb m0, [base+z_filter_wh] 1199*c0909341SAndroid Build Coastguard Worker mova xm2, [r3+angleq*8] 1200*c0909341SAndroid Build Coastguard Worker pand m0, m1 1201*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m2 1202*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m0 1203*c0909341SAndroid Build Coastguard Worker ret 1204*c0909341SAndroid Build Coastguard Worker.w4_no_upsample: 1205*c0909341SAndroid Build Coastguard Worker mov maxbased, 7 1206*c0909341SAndroid Build Coastguard Worker test angled, 
0x400 ; !enable_intra_edge_filter 1207*c0909341SAndroid Build Coastguard Worker jnz .w4_main 1208*c0909341SAndroid Build Coastguard Worker lea maxbased, [hq+3] 1209*c0909341SAndroid Build Coastguard Worker call .filter_strength 1210*c0909341SAndroid Build Coastguard Worker mov maxbased, 7 1211*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1212*c0909341SAndroid Build Coastguard Worker jz .w4_main ; filter_strength == 0 1213*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 1214*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm3, [tlq+14] 1215*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 1216*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] 1217*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] 1218*c0909341SAndroid Build Coastguard Worker palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 1219*c0909341SAndroid Build Coastguard Worker pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 1220*c0909341SAndroid Build Coastguard Worker paddw xm2, xm0 1221*c0909341SAndroid Build Coastguard Worker pmullw xm2, xm4 1222*c0909341SAndroid Build Coastguard Worker movd [rsp+16], xm3 1223*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 1224*c0909341SAndroid Build Coastguard Worker jne .w4_3tap 1225*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1226*c0909341SAndroid Build Coastguard Worker palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 1227*c0909341SAndroid Build Coastguard Worker pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 1228*c0909341SAndroid Build Coastguard Worker movzx r3d, word [tlq+14] 1229*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq+12] 1230*c0909341SAndroid Build Coastguard Worker inc maxbased 1231*c0909341SAndroid Build Coastguard Worker paddw xm2, xm0 1232*c0909341SAndroid Build Coastguard Worker sub r2d, r3d 1233*c0909341SAndroid Build Coastguard Worker paddw xm2, xm2 1234*c0909341SAndroid Build Coastguard Worker lea r2d, 
[r2+r3*8+4] 1235*c0909341SAndroid Build Coastguard Worker shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 1236*c0909341SAndroid Build Coastguard Worker mov [rsp+16], r2w 1237*c0909341SAndroid Build Coastguard Worker.w4_3tap: 1238*c0909341SAndroid Build Coastguard Worker pxor xm0, xm0 1239*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1240*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1241*c0909341SAndroid Build Coastguard Worker psrlw xm1, 3 1242*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1243*c0909341SAndroid Build Coastguard Worker sbb maxbased, -1 1244*c0909341SAndroid Build Coastguard Worker pavgw xm0, xm1 1245*c0909341SAndroid Build Coastguard Worker mova [tlq], xm0 1246*c0909341SAndroid Build Coastguard Worker.w4_main: 1247*c0909341SAndroid Build Coastguard Worker movd xm3, dxd 1248*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [z_base_inc] 1249*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] 1250*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 1251*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 1252*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1253*c0909341SAndroid Build Coastguard Worker mov r3d, dxd ; xpos 1254*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 1255*c0909341SAndroid Build Coastguard Worker paddw m4, m3, m3 1256*c0909341SAndroid Build Coastguard Worker psubw m1, m0 ; -max_base_x 1257*c0909341SAndroid Build Coastguard Worker vpblendd m3, m4, 0xcc 1258*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m3 1259*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 1260*c0909341SAndroid Build Coastguard Worker paddw m4, m4 1261*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1262*c0909341SAndroid Build Coastguard Worker.w4_loop: 1263*c0909341SAndroid Build Coastguard Worker lea r5d, [r3+dxq] 1264*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base0 
1265*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+r3*2] 1266*c0909341SAndroid Build Coastguard Worker lea r3d, [r5+dxq] 1267*c0909341SAndroid Build Coastguard Worker shr r5d, 6 ; base1 1268*c0909341SAndroid Build Coastguard Worker movu xm2, [tlq+r5*2] 1269*c0909341SAndroid Build Coastguard Worker lea r5d, [r3+dxq] 1270*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base2 1271*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq+r3*2], 1 ; 0 2 1272*c0909341SAndroid Build Coastguard Worker lea r3d, [r5+dxq] 1273*c0909341SAndroid Build Coastguard Worker shr r5d, 6 ; base3 1274*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [tlq+r5*2], 1 ; 1 3 1275*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m2 1276*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 1277*c0909341SAndroid Build Coastguard Worker pslldq m2, 6 1278*c0909341SAndroid Build Coastguard Worker vpblendd m1, m2, 0xcc 1279*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1280*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1281*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1282*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1283*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 ; xpos < max_base_x 1284*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1285*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1286*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 1287*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1288*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 1289*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 1290*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1291*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm1 1292*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 1293*c0909341SAndroid Build Coastguard Worker sub hd, 4 1294*c0909341SAndroid Build Coastguard Worker jz 
.w4_end 1295*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1296*c0909341SAndroid Build Coastguard Worker cmp r3d, maxbased 1297*c0909341SAndroid Build Coastguard Worker jb .w4_loop 1298*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 1299*c0909341SAndroid Build Coastguard Worker.w4_end_loop: 1300*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm6 1301*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm6 1302*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm6 1303*c0909341SAndroid Build Coastguard Worker movq [dstq+r6 ], xm6 1304*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1305*c0909341SAndroid Build Coastguard Worker sub hd, 4 1306*c0909341SAndroid Build Coastguard Worker jg .w4_end_loop 1307*c0909341SAndroid Build Coastguard Worker.w4_end: 1308*c0909341SAndroid Build Coastguard Worker RET 1309*c0909341SAndroid Build Coastguard Worker.w8: 1310*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -64, 7 1311*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+216] 1312*c0909341SAndroid Build Coastguard Worker mov r3b, hb 1313*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 1314*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 1315*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ 1316*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ 1317*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1318*c0909341SAndroid Build Coastguard Worker cmp hd, 4 1319*c0909341SAndroid Build Coastguard Worker jne .w8_upsample_h8 ; awkward single-pixel edge case 1320*c0909341SAndroid Build Coastguard Worker vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ 1321*c0909341SAndroid Build Coastguard Worker.w8_upsample_h8: 1322*c0909341SAndroid Build Coastguard Worker paddw m2, m1 
1323*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1324*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1325*c0909341SAndroid Build Coastguard Worker psubw m0, m2, m0 1326*c0909341SAndroid Build Coastguard Worker psraw m0, 3 1327*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1328*c0909341SAndroid Build Coastguard Worker paddw m2, m0 1329*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, r8m 1330*c0909341SAndroid Build Coastguard Worker movd xm3, dxd 1331*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m4 1332*c0909341SAndroid Build Coastguard Worker mov r3d, dxd 1333*c0909341SAndroid Build Coastguard Worker pavgw m2, m4 1334*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 1335*c0909341SAndroid Build Coastguard Worker pminsw m2, m0 1336*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 1337*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 1338*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [z_upsample] 1339*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0], xm0 1340*c0909341SAndroid Build Coastguard Worker mova [rsp+16], xm1 1341*c0909341SAndroid Build Coastguard Worker paddw m6, m3, m3 1342*c0909341SAndroid Build Coastguard Worker vextracti128 [rsp+32], m0, 1 1343*c0909341SAndroid Build Coastguard Worker vextracti128 [rsp+48], m1, 1 1344*c0909341SAndroid Build Coastguard Worker vpblendd m3, m6, 0xf0 ; xpos0 xpos1 1345*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop: 1346*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 1347*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base0 1348*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r3*2] 1349*c0909341SAndroid Build Coastguard Worker movu xm2, [rsp+r3*2+16] 1350*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 1351*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base1 1352*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r2*2], 1 
1353*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [rsp+r2*2+16], 1 1354*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1355*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1356*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m2 1357*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m2 1358*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1359*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1360*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1361*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1362*c0909341SAndroid Build Coastguard Worker paddw m3, m6 1363*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1364*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 1365*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 1366*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1367*c0909341SAndroid Build Coastguard Worker sub hd, 2 1368*c0909341SAndroid Build Coastguard Worker jg .w8_upsample_loop 1369*c0909341SAndroid Build Coastguard Worker RET 1370*c0909341SAndroid Build Coastguard Worker.w8_no_intra_edge_filter: 1371*c0909341SAndroid Build Coastguard Worker and maxbased, 7 1372*c0909341SAndroid Build Coastguard Worker or maxbased, 8 ; imin(h+7, 15) 1373*c0909341SAndroid Build Coastguard Worker jmp .w8_main 1374*c0909341SAndroid Build Coastguard Worker.w8_no_upsample: 1375*c0909341SAndroid Build Coastguard Worker lea maxbased, [hq+7] 1376*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1377*c0909341SAndroid Build Coastguard Worker jnz .w8_no_intra_edge_filter 1378*c0909341SAndroid Build Coastguard Worker call .filter_strength 1379*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1380*c0909341SAndroid Build Coastguard Worker jz .w8_main 1381*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 1382*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] 1383*c0909341SAndroid Build 
Coastguard Worker vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1384*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1385*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1386*c0909341SAndroid Build Coastguard Worker pmullw m1, m2 1387*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1388*c0909341SAndroid Build Coastguard Worker jl .w8_filter_h4 1389*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m2 1390*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1391*c0909341SAndroid Build Coastguard Worker je .w8_filter_end ; 8x4 and 8x8 are always 3-tap 1392*c0909341SAndroid Build Coastguard Worker movzx r3d, word [tlq+30] 1393*c0909341SAndroid Build Coastguard Worker mov maxbased, 16 1394*c0909341SAndroid Build Coastguard Worker mov [rsp+32], r3d 1395*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 1396*c0909341SAndroid Build Coastguard Worker jne .w8_filter_end 1397*c0909341SAndroid Build Coastguard Worker punpcklwd xm6, xm0, xm0 1398*c0909341SAndroid Build Coastguard Worker vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 1399*c0909341SAndroid Build Coastguard Worker vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1400*c0909341SAndroid Build Coastguard Worker movzx r5d, word [tlq+28] 1401*c0909341SAndroid Build Coastguard Worker mov [rsp+34], r3w 1402*c0909341SAndroid Build Coastguard Worker paddw m2, m6 1403*c0909341SAndroid Build Coastguard Worker sub r5d, r3d 1404*c0909341SAndroid Build Coastguard Worker inc maxbased 1405*c0909341SAndroid Build Coastguard Worker paddw m2, m2 1406*c0909341SAndroid Build Coastguard Worker lea r3d, [r5+r3*8+4] 1407*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1408*c0909341SAndroid Build Coastguard Worker shr r3d, 3 1409*c0909341SAndroid Build Coastguard Worker mov [rsp+32], r3w 1410*c0909341SAndroid Build Coastguard Worker jmp .w8_filter_end 
1411*c0909341SAndroid Build Coastguard Worker.w8_filter_h4: 1412*c0909341SAndroid Build Coastguard Worker pshuflw m3, m2, q3321 1413*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ 1414*c0909341SAndroid Build Coastguard Worker.w8_filter_end: 1415*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1416*c0909341SAndroid Build Coastguard Worker pmullw m0, m4 1417*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1418*c0909341SAndroid Build Coastguard Worker pxor m2, m2 1419*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1420*c0909341SAndroid Build Coastguard Worker psrlw m0, 3 1421*c0909341SAndroid Build Coastguard Worker pavgw m0, m2 1422*c0909341SAndroid Build Coastguard Worker mova [tlq], m0 1423*c0909341SAndroid Build Coastguard Worker.w8_main: 1424*c0909341SAndroid Build Coastguard Worker movd xm3, dxd 1425*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [z_base_inc] 1426*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 1427*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 1428*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 1429*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1430*c0909341SAndroid Build Coastguard Worker mov r3d, dxd 1431*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 1432*c0909341SAndroid Build Coastguard Worker paddw m4, m3, m3 1433*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1434*c0909341SAndroid Build Coastguard Worker vpblendd m3, m4, 0xf0 ; xpos0 xpos1 1435*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1436*c0909341SAndroid Build Coastguard Worker.w8_loop: 1437*c0909341SAndroid Build Coastguard Worker lea r5d, [r3+dxq] 1438*c0909341SAndroid Build Coastguard Worker shr r3d, 6 1439*c0909341SAndroid Build Coastguard Worker movu xm0, [tlq+r3*2] 1440*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+r3*2+2] 1441*c0909341SAndroid Build Coastguard Worker lea r3d, 
[r5+dxq] 1442*c0909341SAndroid Build Coastguard Worker shr r5d, 6 1443*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [tlq+r5*2], 1 1444*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq+r5*2+2], 1 1445*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1446*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1447*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1448*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1449*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 1450*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1451*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1452*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 1453*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 1454*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 1455*c0909341SAndroid Build Coastguard Worker sub hd, 2 1456*c0909341SAndroid Build Coastguard Worker jz .w8_end 1457*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1458*c0909341SAndroid Build Coastguard Worker cmp r3d, maxbased 1459*c0909341SAndroid Build Coastguard Worker jb .w8_loop 1460*c0909341SAndroid Build Coastguard Worker.w8_end_loop: 1461*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm6 1462*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm6 1463*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1464*c0909341SAndroid Build Coastguard Worker sub hd, 2 1465*c0909341SAndroid Build Coastguard Worker jg .w8_end_loop 1466*c0909341SAndroid Build Coastguard Worker.w8_end: 1467*c0909341SAndroid Build Coastguard Worker RET 1468*c0909341SAndroid Build Coastguard Worker.w16_no_intra_edge_filter: 1469*c0909341SAndroid Build Coastguard Worker and maxbased, 15 1470*c0909341SAndroid Build Coastguard Worker or maxbased, 16 ; imin(h+15, 31) 1471*c0909341SAndroid Build Coastguard Worker jmp .w16_main 1472*c0909341SAndroid Build Coastguard 
Worker.w16: 1473*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -96, 7 1474*c0909341SAndroid Build Coastguard Worker lea maxbased, [hq+15] 1475*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1476*c0909341SAndroid Build Coastguard Worker jnz .w16_no_intra_edge_filter 1477*c0909341SAndroid Build Coastguard Worker call .filter_strength 1478*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1479*c0909341SAndroid Build Coastguard Worker jz .w16_main 1480*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 1481*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1482*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1483*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 1484*c0909341SAndroid Build Coastguard Worker jne .w16_filter_3tap 1485*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pw_3] 1486*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm0 1487*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1488*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1489*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1490*c0909341SAndroid Build Coastguard Worker pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1491*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1492*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1493*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1494*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1495*c0909341SAndroid Build Coastguard Worker paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1496*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1497*c0909341SAndroid Build Coastguard Worker jl .w16_filter_5tap_h4 1498*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m3 1499*c0909341SAndroid Build 
Coastguard Worker je .w16_filter_5tap_h8 1500*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1501*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1502*c0909341SAndroid Build Coastguard Worker movzx r3d, word [tlq+62] 1503*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq+60] 1504*c0909341SAndroid Build Coastguard Worker pavgw m2, m4 1505*c0909341SAndroid Build Coastguard Worker sub r2d, r3d 1506*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1507*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r3*8+4] 1508*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1509*c0909341SAndroid Build Coastguard Worker shr r2d, 3 1510*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 1511*c0909341SAndroid Build Coastguard Worker mov [rsp+66], r3w 1512*c0909341SAndroid Build Coastguard Worker mov [rsp+64], r2w 1513*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1514*c0909341SAndroid Build Coastguard Worker mov r3d, 33 1515*c0909341SAndroid Build Coastguard Worker cmp hd, 16 1516*c0909341SAndroid Build Coastguard Worker cmovg maxbased, r3d 1517*c0909341SAndroid Build Coastguard Worker jmp .w16_filter_end2 1518*c0909341SAndroid Build Coastguard Worker.w16_filter_5tap_h8: 1519*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 1520*c0909341SAndroid Build Coastguard Worker vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1521*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm4 1522*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 1523*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1524*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 1525*c0909341SAndroid Build Coastguard Worker jmp .w16_filter_end2 1526*c0909341SAndroid Build Coastguard Worker.w16_filter_5tap_h4: 1527*c0909341SAndroid Build Coastguard Worker pshuflw xm4, xm3, q3332 ; 4 5 5 5 1528*c0909341SAndroid Build 
Coastguard Worker pshuflw xm3, xm3, q3321 ; 3 4 5 5 1529*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm4 1530*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 1531*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1532*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 1533*c0909341SAndroid Build Coastguard Worker jmp .w16_filter_end2 1534*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap: 1535*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] 1536*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1537*c0909341SAndroid Build Coastguard Worker pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1538*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1539*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 1540*c0909341SAndroid Build Coastguard Worker pmullw m3, m2 1541*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1542*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1543*c0909341SAndroid Build Coastguard Worker je .w16_filter_3tap_h8 1544*c0909341SAndroid Build Coastguard Worker jl .w16_filter_3tap_h4 1545*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m2 1546*c0909341SAndroid Build Coastguard Worker vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1547*c0909341SAndroid Build Coastguard Worker jmp .w16_filter_end 1548*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap_h4: 1549*c0909341SAndroid Build Coastguard Worker pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ 1550*c0909341SAndroid Build Coastguard Worker jmp .w16_filter_end 1551*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap_h8: 1552*c0909341SAndroid Build Coastguard Worker psrldq xm2, 2 1553*c0909341SAndroid Build Coastguard Worker pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 1554*c0909341SAndroid Build Coastguard Worker.w16_filter_end: 1555*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 
8 9 a b c d e f 1556*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 1557*c0909341SAndroid Build Coastguard Worker psrlw m0, 3 1558*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1559*c0909341SAndroid Build Coastguard Worker paddw m2, m3 1560*c0909341SAndroid Build Coastguard Worker psrlw m2, 3 1561*c0909341SAndroid Build Coastguard Worker pavgw m0, m1 1562*c0909341SAndroid Build Coastguard Worker pavgw m1, m2 1563*c0909341SAndroid Build Coastguard Worker.w16_filter_end2: 1564*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1565*c0909341SAndroid Build Coastguard Worker mova [tlq+ 0], m0 1566*c0909341SAndroid Build Coastguard Worker mova [tlq+32], m1 1567*c0909341SAndroid Build Coastguard Worker.w16_main: 1568*c0909341SAndroid Build Coastguard Worker movd xm4, dxd 1569*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 1570*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 1571*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 1572*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1573*c0909341SAndroid Build Coastguard Worker mov r3d, dxd 1574*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 1575*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [z_base_inc] 1576*c0909341SAndroid Build Coastguard Worker psubw m3, m0 1577*c0909341SAndroid Build Coastguard Worker.w16_loop: 1578*c0909341SAndroid Build Coastguard Worker lea r5d, [r3+dxq] 1579*c0909341SAndroid Build Coastguard Worker shr r3d, 6 1580*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2] 1581*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+2] 1582*c0909341SAndroid Build Coastguard Worker lea r3d, [r5+dxq] 1583*c0909341SAndroid Build Coastguard Worker shr r5d, 6 1584*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1585*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1586*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1587*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 
1588*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 1589*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1590*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1591*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2] 1592*c0909341SAndroid Build Coastguard Worker vpblendvb m2, m6, m1, m2 1593*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2+2] 1594*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m2 1595*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1596*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1597*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1598*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1599*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 1600*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1601*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1602*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 1603*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 1604*c0909341SAndroid Build Coastguard Worker sub hd, 2 1605*c0909341SAndroid Build Coastguard Worker jz .w16_end 1606*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1607*c0909341SAndroid Build Coastguard Worker cmp r3d, maxbased 1608*c0909341SAndroid Build Coastguard Worker jb .w16_loop 1609*c0909341SAndroid Build Coastguard Worker.w16_end_loop: 1610*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m6 1611*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m6 1612*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1613*c0909341SAndroid Build Coastguard Worker sub hd, 2 1614*c0909341SAndroid Build Coastguard Worker jg .w16_end_loop 1615*c0909341SAndroid Build Coastguard Worker.w16_end: 1616*c0909341SAndroid Build Coastguard Worker RET 1617*c0909341SAndroid Build Coastguard Worker.w32: 1618*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -160, 8 1619*c0909341SAndroid Build Coastguard Worker 
lea maxbased, [hq+31] 1620*c0909341SAndroid Build Coastguard Worker mov r3d, 63 1621*c0909341SAndroid Build Coastguard Worker cmp hd, 32 1622*c0909341SAndroid Build Coastguard Worker cmova maxbased, r3d 1623*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1624*c0909341SAndroid Build Coastguard Worker jnz .w32_main 1625*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pw_3] 1626*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1627*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm0, xm0 1628*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1629*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1630*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1631*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1632*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1633*c0909341SAndroid Build Coastguard Worker mov r3, rsp 1634*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1635*c0909341SAndroid Build Coastguard Worker lea r5d, [maxbaseq-31] 1636*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1637*c0909341SAndroid Build Coastguard Worker mova [r3], m0 1638*c0909341SAndroid Build Coastguard Worker.w32_filter_loop: 1639*c0909341SAndroid Build Coastguard Worker mova m0, [tlq+30] 1640*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [tlq+28] 1641*c0909341SAndroid Build Coastguard Worker add tlq, 32 1642*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+0] 1643*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq+4] 1644*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+2] 1645*c0909341SAndroid Build Coastguard Worker add r3, 32 1646*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1647*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1648*c0909341SAndroid Build Coastguard Worker mova 
[r3], m0 1649*c0909341SAndroid Build Coastguard Worker sub r5d, 16 1650*c0909341SAndroid Build Coastguard Worker jg .w32_filter_loop 1651*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1652*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m0 1653*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1654*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1655*c0909341SAndroid Build Coastguard Worker jl .w32_filter_h8 1656*c0909341SAndroid Build Coastguard Worker vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1657*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1658*c0909341SAndroid Build Coastguard Worker movzx r5d, word [tlq+62] 1659*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq+60] 1660*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 1661*c0909341SAndroid Build Coastguard Worker sub r2d, r5d 1662*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1663*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r5*8+4] 1664*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1665*c0909341SAndroid Build Coastguard Worker shr r2d, 3 1666*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1667*c0909341SAndroid Build Coastguard Worker mova [r3+32], m0 1668*c0909341SAndroid Build Coastguard Worker mov [r3+66], r5w 1669*c0909341SAndroid Build Coastguard Worker mov [r3+64], r2w 1670*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1671*c0909341SAndroid Build Coastguard Worker mov r3d, 65 1672*c0909341SAndroid Build Coastguard Worker cmp hd, 64 1673*c0909341SAndroid Build Coastguard Worker cmove maxbased, r3d 1674*c0909341SAndroid Build Coastguard Worker jmp .w32_main 1675*c0909341SAndroid Build Coastguard Worker.w32_filter_h8: 1676*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 
1677*c0909341SAndroid Build Coastguard Worker vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1678*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm3 1679*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 1680*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1681*c0909341SAndroid Build Coastguard Worker paddw xm0, xm2 1682*c0909341SAndroid Build Coastguard Worker psrlw xm0, 2 1683*c0909341SAndroid Build Coastguard Worker mova [r3+32], xm0 1684*c0909341SAndroid Build Coastguard Worker.w32_main: 1685*c0909341SAndroid Build Coastguard Worker movd xm4, dxd 1686*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 1687*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 1688*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 1689*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1690*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1691*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_m1024] ; -16 * 64 1692*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 1693*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [z_base_inc] 1694*c0909341SAndroid Build Coastguard Worker psubw m3, m0 1695*c0909341SAndroid Build Coastguard Worker.w32_loop: 1696*c0909341SAndroid Build Coastguard Worker mov r3d, r5d 1697*c0909341SAndroid Build Coastguard Worker shr r3d, 6 1698*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2] 1699*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+2] 1700*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1701*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1702*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1703*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1704*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1705*c0909341SAndroid Build Coastguard Worker psraw m1, m3, 15 1706*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 1707*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 
1708*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2+32] 1709*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+34] 1710*c0909341SAndroid Build Coastguard Worker add r5d, dxd 1711*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1712*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1713*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m7, m3 1714*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1715*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1716*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 1717*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 1718*c0909341SAndroid Build Coastguard Worker dec hd 1719*c0909341SAndroid Build Coastguard Worker jz .w32_end 1720*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1721*c0909341SAndroid Build Coastguard Worker cmp r5d, maxbased 1722*c0909341SAndroid Build Coastguard Worker jb .w32_loop 1723*c0909341SAndroid Build Coastguard Worker.w32_end_loop: 1724*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m6 1725*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m6 1726*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1727*c0909341SAndroid Build Coastguard Worker dec hd 1728*c0909341SAndroid Build Coastguard Worker jg .w32_end_loop 1729*c0909341SAndroid Build Coastguard Worker.w32_end: 1730*c0909341SAndroid Build Coastguard Worker RET 1731*c0909341SAndroid Build Coastguard Worker.w64: 1732*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -256, 10 1733*c0909341SAndroid Build Coastguard Worker lea maxbased, [hq+63] 1734*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1735*c0909341SAndroid Build Coastguard Worker jnz .w64_main 1736*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pw_3] 1737*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1738*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm0, xm0 1739*c0909341SAndroid Build 
Coastguard Worker vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1740*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1741*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1742*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1743*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1744*c0909341SAndroid Build Coastguard Worker mov r3, rsp 1745*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1746*c0909341SAndroid Build Coastguard Worker lea r5d, [hq+32] 1747*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1748*c0909341SAndroid Build Coastguard Worker mova [r3], m0 1749*c0909341SAndroid Build Coastguard Worker.w64_filter_loop: 1750*c0909341SAndroid Build Coastguard Worker mova m0, [tlq+30] 1751*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [tlq+28] 1752*c0909341SAndroid Build Coastguard Worker add tlq, 32 1753*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+0] 1754*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq+4] 1755*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+2] 1756*c0909341SAndroid Build Coastguard Worker add r3, 32 1757*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1758*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1759*c0909341SAndroid Build Coastguard Worker mova [r3], m0 1760*c0909341SAndroid Build Coastguard Worker sub r5d, 16 1761*c0909341SAndroid Build Coastguard Worker jg .w64_filter_loop 1762*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1763*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m0 1764*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1765*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1766*c0909341SAndroid Build Coastguard Worker vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e 
f g h h h 1767*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1768*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 1769*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1770*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1771*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1772*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 1773*c0909341SAndroid Build Coastguard Worker mova [r3+32], m0 1774*c0909341SAndroid Build Coastguard Worker.w64_main: 1775*c0909341SAndroid Build Coastguard Worker movd xm4, dxd 1776*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 1777*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 1778*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 1779*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 1780*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1781*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_m1024] ; -16 * 64 1782*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 1783*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [z_base_inc] 1784*c0909341SAndroid Build Coastguard Worker paddw m8, m7, m7 ; -32 * 64 1785*c0909341SAndroid Build Coastguard Worker psubw m3, m0 1786*c0909341SAndroid Build Coastguard Worker paddw m9, m8, m7 ; -48 * 64 1787*c0909341SAndroid Build Coastguard Worker.w64_loop: 1788*c0909341SAndroid Build Coastguard Worker mov r3d, r5d 1789*c0909341SAndroid Build Coastguard Worker shr r3d, 6 1790*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2] 1791*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+2] 1792*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 1793*c0909341SAndroid Build Coastguard Worker psllw m2, 9 1794*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1795*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1796*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1797*c0909341SAndroid Build 
Coastguard Worker psraw m1, m3, 15 1798*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 1799*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 1800*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2+32] 1801*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+34] 1802*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1803*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1804*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1805*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m7, m3 1806*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 1807*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 1808*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2+64] 1809*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+66] 1810*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1811*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1812*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1813*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m8, m3 1814*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 1815*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 1816*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3*2+96] 1817*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+98] 1818*c0909341SAndroid Build Coastguard Worker add r5d, dxd 1819*c0909341SAndroid Build Coastguard Worker psubw m1, m0 1820*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1821*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m9, m3 1822*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1823*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1824*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 1825*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m0 1826*c0909341SAndroid Build Coastguard Worker dec hd 1827*c0909341SAndroid Build Coastguard Worker jz .w64_end 1828*c0909341SAndroid Build 
Coastguard Worker add dstq, strideq 1829*c0909341SAndroid Build Coastguard Worker cmp r5d, maxbased 1830*c0909341SAndroid Build Coastguard Worker jb .w64_loop 1831*c0909341SAndroid Build Coastguard Worker.w64_end_loop: 1832*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m6 1833*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m6 1834*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m6 1835*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m6 1836*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1837*c0909341SAndroid Build Coastguard Worker dec hd 1838*c0909341SAndroid Build Coastguard Worker jg .w64_end_loop 1839*c0909341SAndroid Build Coastguard Worker.w64_end: 1840*c0909341SAndroid Build Coastguard Worker RET 1841*c0909341SAndroid Build Coastguard Worker 1842*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy 1843*c0909341SAndroid Build Coastguard Worker%define base r9-z_filter_t0 1844*c0909341SAndroid Build Coastguard Worker lea r9, [ipred_z2_16bpc_avx2_table] 1845*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 1846*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 1847*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1848*c0909341SAndroid Build Coastguard Worker lea dxq, [dr_intra_derivative-90] 1849*c0909341SAndroid Build Coastguard Worker movsxd wq, [r9+wq*4] 1850*c0909341SAndroid Build Coastguard Worker mova m1, [tlq- 0] 1851*c0909341SAndroid Build Coastguard Worker movzx dyd, angleb 1852*c0909341SAndroid Build Coastguard Worker xor angled, 0x400 1853*c0909341SAndroid Build Coastguard Worker mova m2, [tlq- 32] 1854*c0909341SAndroid Build Coastguard Worker mov r8, dxq 1855*c0909341SAndroid Build Coastguard Worker sub dxq, dyq 1856*c0909341SAndroid Build Coastguard Worker mova m3, [tlq- 64] 1857*c0909341SAndroid Build Coastguard Worker add wq, r9 1858*c0909341SAndroid Build Coastguard Worker add r9, 
z_filter_t0-ipred_z2_16bpc_avx2_table 1859*c0909341SAndroid Build Coastguard Worker mova m4, [tlq- 96] 1860*c0909341SAndroid Build Coastguard Worker and dyd, ~1 1861*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-128] 1862*c0909341SAndroid Build Coastguard Worker and dxq, ~1 1863*c0909341SAndroid Build Coastguard Worker movzx dyd, word [r8+dyq] ; angle - 90 1864*c0909341SAndroid Build Coastguard Worker movzx dxd, word [dxq+270] ; 180 - angle 1865*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pw_62] 1866*c0909341SAndroid Build Coastguard Worker mova [rsp+128], m1 1867*c0909341SAndroid Build Coastguard Worker mova [rsp+ 96], m2 1868*c0909341SAndroid Build Coastguard Worker mova [rsp+ 64], m3 1869*c0909341SAndroid Build Coastguard Worker neg dxd 1870*c0909341SAndroid Build Coastguard Worker mova [rsp+ 32], m4 1871*c0909341SAndroid Build Coastguard Worker neg dyq 1872*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0], m5 1873*c0909341SAndroid Build Coastguard Worker jmp wq 1874*c0909341SAndroid Build Coastguard Worker.w4: 1875*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [base+z2_x_shuf] 1876*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [base+z_base_inc+2] 1877*c0909341SAndroid Build Coastguard Worker lea r8d, [dxq+(65<<6)] ; xpos 1878*c0909341SAndroid Build Coastguard Worker mov r10d, (63-4)<<6 1879*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1880*c0909341SAndroid Build Coastguard Worker jnz .w4_main ; !enable_intra_edge_filter 1881*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+2] 1882*c0909341SAndroid Build Coastguard Worker add angled, 1022 1883*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1884*c0909341SAndroid Build Coastguard Worker test r3d, angled 1885*c0909341SAndroid Build Coastguard Worker jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 1886*c0909341SAndroid Build Coastguard Worker movq xm0, [tlq+2] ; 1 2 3 4 1887*c0909341SAndroid Build 
Coastguard Worker movq xm1, [tlq+0] ; 0 1 2 3 1888*c0909341SAndroid Build Coastguard Worker pshuflw xm2, xm0, q3321 ; 2 3 4 4 1889*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm1, q2100 ; 0 0 1 2 1890*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm4, r8m ; pixel_max 1891*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [base+z_upsample] 1892*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0 1893*c0909341SAndroid Build Coastguard Worker paddw xm2, xm3 1894*c0909341SAndroid Build Coastguard Worker lea r8d, [r8+dxq+(1<<6)] 1895*c0909341SAndroid Build Coastguard Worker psubw xm2, xm1, xm2 1896*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1897*c0909341SAndroid Build Coastguard Worker psraw xm2, 3 1898*c0909341SAndroid Build Coastguard Worker pxor xm3, xm3 1899*c0909341SAndroid Build Coastguard Worker sub r10d, 3<<6 1900*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1901*c0909341SAndroid Build Coastguard Worker paddw m6, m6 1902*c0909341SAndroid Build Coastguard Worker pmaxsw xm1, xm3 1903*c0909341SAndroid Build Coastguard Worker sub angled, 1075 ; angle - 53 1904*c0909341SAndroid Build Coastguard Worker pavgw xm1, xm3 1905*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1906*c0909341SAndroid Build Coastguard Worker pminsw xm1, xm4 1907*c0909341SAndroid Build Coastguard Worker xor angled, 0x7f ; 180 - angle 1908*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm0 1909*c0909341SAndroid Build Coastguard Worker movu [rsp+130], xm1 1910*c0909341SAndroid Build Coastguard Worker call .filter_strength 1911*c0909341SAndroid Build Coastguard Worker jmp .w4_filter_left 1912*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1913*c0909341SAndroid Build Coastguard Worker.filter_strength: 1914*c0909341SAndroid Build Coastguard Worker movd xm8, r3d 1915*c0909341SAndroid Build Coastguard Worker mov r3d, angled 1916*c0909341SAndroid Build Coastguard Worker movd xm7, angled 1917*c0909341SAndroid Build 
Coastguard Worker vpbroadcastb m8, xm8 1918*c0909341SAndroid Build Coastguard Worker shr r3d, 8 ; is_sm << 1 1919*c0909341SAndroid Build Coastguard Worker vpbroadcastb m7, xm7 1920*c0909341SAndroid Build Coastguard Worker pcmpeqb m8, [base+z_filter_wh] 1921*c0909341SAndroid Build Coastguard Worker mova xm9, [r9+r3*8] 1922*c0909341SAndroid Build Coastguard Worker pand m0, m8, m7 1923*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m9 1924*c0909341SAndroid Build Coastguard Worker pmovmskb r3d, m0 1925*c0909341SAndroid Build Coastguard Worker ret 1926*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1927*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8 1928*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 1929*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 1930*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm4, r8m ; pixel_max 1931*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1932*c0909341SAndroid Build Coastguard Worker je .upsample_left_h8 1933*c0909341SAndroid Build Coastguard Worker pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 1934*c0909341SAndroid Build Coastguard Worker pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 1935*c0909341SAndroid Build Coastguard Worker jmp .upsample_left_end 1936*c0909341SAndroid Build Coastguard Worker.upsample_left_h8: 1937*c0909341SAndroid Build Coastguard Worker pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 1938*c0909341SAndroid Build Coastguard Worker pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 1939*c0909341SAndroid Build Coastguard Worker.upsample_left_end: 1940*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0 1941*c0909341SAndroid Build Coastguard Worker paddw xm2, xm3 1942*c0909341SAndroid Build Coastguard Worker psubw xm2, xm1, xm2 1943*c0909341SAndroid Build Coastguard Worker add dyq, dyq 1944*c0909341SAndroid Build Coastguard Worker psraw xm2, 3 1945*c0909341SAndroid Build Coastguard Worker pxor xm3, xm3 
1946*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1947*c0909341SAndroid Build Coastguard Worker pmaxsw xm1, xm3 1948*c0909341SAndroid Build Coastguard Worker pavgw xm1, xm3 1949*c0909341SAndroid Build Coastguard Worker pminsw xm1, xm4 1950*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm0, xm1 1951*c0909341SAndroid Build Coastguard Worker punpckhwd xm0, xm1 1952*c0909341SAndroid Build Coastguard Worker mova [rsp+ 96+gprsize], xm2 1953*c0909341SAndroid Build Coastguard Worker mova [rsp+112+gprsize], xm0 1954*c0909341SAndroid Build Coastguard Worker ret 1955*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above: 1956*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1957*c0909341SAndroid Build Coastguard Worker sub angled, 1112 ; angle - 90 1958*c0909341SAndroid Build Coastguard Worker call .filter_strength 1959*c0909341SAndroid Build Coastguard Worker test r3d, r3d 1960*c0909341SAndroid Build Coastguard Worker jz .w4_no_filter_above 1961*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 1962*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 1963*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 1964*c0909341SAndroid Build Coastguard Worker psrldq xm0, xm1, 2 ; 1 2 3 4 1965*c0909341SAndroid Build Coastguard Worker pshuflw xm2, xm1, q2100 ; 0 0 1 2 1966*c0909341SAndroid Build Coastguard Worker pmullw xm4, xm0 1967*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm0, q3321 ; 2 3 4 4 1968*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 1969*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm0, q3332 ; 3 4 4 4 1970*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5 1971*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] 1972*c0909341SAndroid Build Coastguard Worker paddw xm2, xm3 1973*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, r6m ; max_width 1974*c0909341SAndroid Build 
Coastguard Worker pmullw xm2, xm5 1975*c0909341SAndroid Build Coastguard Worker packssdw xm3, xm3 1976*c0909341SAndroid Build Coastguard Worker paddw xm1, xm4 1977*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 1978*c0909341SAndroid Build Coastguard Worker psubw xm3, [base+pw_1to16] 1979*c0909341SAndroid Build Coastguard Worker pxor xm4, xm4 1980*c0909341SAndroid Build Coastguard Worker psrlw xm1, 3 1981*c0909341SAndroid Build Coastguard Worker pminsw xm3, xm11 ; clip to byte range since there's no variable word blend 1982*c0909341SAndroid Build Coastguard Worker pavgw xm1, xm4 1983*c0909341SAndroid Build Coastguard Worker vpblendvb xm1, xm0, xm3 1984*c0909341SAndroid Build Coastguard Worker movq [rsp+130], xm1 1985*c0909341SAndroid Build Coastguard Worker.w4_no_filter_above: 1986*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+2] 1987*c0909341SAndroid Build Coastguard Worker add angled, 973 ; angle + 883 1988*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1989*c0909341SAndroid Build Coastguard Worker test r3d, angled 1990*c0909341SAndroid Build Coastguard Worker jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 1991*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [base+pb_90] 1992*c0909341SAndroid Build Coastguard Worker psubb xm0, xm7 ; 180 - angle 1993*c0909341SAndroid Build Coastguard Worker pand xm0, xm8 ; reuse from previous filter_strength call 1994*c0909341SAndroid Build Coastguard Worker pcmpgtb xm0, xm9 1995*c0909341SAndroid Build Coastguard Worker pmovmskb r3d, xm0 1996*c0909341SAndroid Build Coastguard Worker.w4_filter_left: 1997*c0909341SAndroid Build Coastguard Worker test r3d, r3d 1998*c0909341SAndroid Build Coastguard Worker jz .w4_main 1999*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2000*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2001*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, r7m ; max_height 2002*c0909341SAndroid Build 
Coastguard Worker cmp r3d, 3 2003*c0909341SAndroid Build Coastguard Worker je .w4_filter_left_s3 2004*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2005*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2006*c0909341SAndroid Build Coastguard Worker pmullw m2, m0 2007*c0909341SAndroid Build Coastguard Worker cmp hd, 8 2008*c0909341SAndroid Build Coastguard Worker jl .w4_filter_left_h4 2009*c0909341SAndroid Build Coastguard Worker movu m4, [tlq-34] 2010*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m0 2011*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2012*c0909341SAndroid Build Coastguard Worker je .w4_filter_left_end 2013*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2014*c0909341SAndroid Build Coastguard Worker jmp .w4_filter_left_end 2015*c0909341SAndroid Build Coastguard Worker.w4_upsample_left: 2016*c0909341SAndroid Build Coastguard Worker call .upsample_left 2017*c0909341SAndroid Build Coastguard Worker mov r11, -16 2018*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [base+z_upsample] 2019*c0909341SAndroid Build Coastguard Worker jmp .w4_main_upsample_left 2020*c0909341SAndroid Build Coastguard Worker.w4_filter_left_s3: ; can only be h16 2021*c0909341SAndroid Build Coastguard Worker movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2022*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+pw_3] 2023*c0909341SAndroid Build Coastguard Worker paddw m1, m0, m2 2024*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m2 2025*c0909341SAndroid Build Coastguard Worker vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2026*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0, xm0 2027*c0909341SAndroid Build Coastguard Worker paddw m2, m4 2028*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 
6 8 8 9 a b c d e 2029*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d 2030*c0909341SAndroid Build Coastguard Worker paddw m1, m4 2031*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 2032*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2033*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 2034*c0909341SAndroid Build Coastguard Worker jmp .w4_filter_left_end2 2035*c0909341SAndroid Build Coastguard Worker.w4_filter_left_h4: 2036*c0909341SAndroid Build Coastguard Worker pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2037*c0909341SAndroid Build Coastguard Worker.w4_filter_left_end: 2038*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2039*c0909341SAndroid Build Coastguard Worker pmullw m1, m3 2040*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2041*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2042*c0909341SAndroid Build Coastguard Worker psrlw m1, 3 2043*c0909341SAndroid Build Coastguard Worker pavgw m1, m2 2044*c0909341SAndroid Build Coastguard Worker.w4_filter_left_end2: 2045*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 2046*c0909341SAndroid Build Coastguard Worker psubw m5, [base+pw_16to1] 2047*c0909341SAndroid Build Coastguard Worker pminsw m5, m11 2048*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m0, m5 2049*c0909341SAndroid Build Coastguard Worker mova [rsp+96], m1 2050*c0909341SAndroid Build Coastguard Worker.w4_main: 2051*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [base+z2_x_shuf] 2052*c0909341SAndroid Build Coastguard Worker mov r11, -8 2053*c0909341SAndroid Build Coastguard Worker.w4_main_upsample_left: 2054*c0909341SAndroid Build Coastguard Worker movd xm5, dyd 2055*c0909341SAndroid Build Coastguard Worker mova m4, [base+z2_y_shuf_h4] 2056*c0909341SAndroid Build Coastguard Worker mov r2d, r8d 2057*c0909341SAndroid Build Coastguard Worker movd xm0, dxd 2058*c0909341SAndroid Build 
Coastguard Worker vpbroadcastw m5, xm5 2059*c0909341SAndroid Build Coastguard Worker rorx r5, dyq, 5 2060*c0909341SAndroid Build Coastguard Worker lea r8d, [dyq*3] 2061*c0909341SAndroid Build Coastguard Worker pmullw m5, [base+z2_ymul] 2062*c0909341SAndroid Build Coastguard Worker rorx r9, dyq, 4 2063*c0909341SAndroid Build Coastguard Worker sar dyd, 6 2064*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 2065*c0909341SAndroid Build Coastguard Worker sar r8d, 6 2066*c0909341SAndroid Build Coastguard Worker pand m5, m11 ; frac_y 2067*c0909341SAndroid Build Coastguard Worker neg dyd 2068*c0909341SAndroid Build Coastguard Worker psllw m5, 9 2069*c0909341SAndroid Build Coastguard Worker add r5d, dyd 2070*c0909341SAndroid Build Coastguard Worker add r8d, dyd 2071*c0909341SAndroid Build Coastguard Worker add r9d, dyd 2072*c0909341SAndroid Build Coastguard Worker paddw m7, m0, m0 2073*c0909341SAndroid Build Coastguard Worker lea dyq, [rsp+dyq*2+126] 2074*c0909341SAndroid Build Coastguard Worker vpblendd m0, m7, 0xcc 2075*c0909341SAndroid Build Coastguard Worker add dyq, r11 2076*c0909341SAndroid Build Coastguard Worker neg r5d 2077*c0909341SAndroid Build Coastguard Worker paddw m1, m0, m7 2078*c0909341SAndroid Build Coastguard Worker neg r8d 2079*c0909341SAndroid Build Coastguard Worker vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 2080*c0909341SAndroid Build Coastguard Worker neg r9d 2081*c0909341SAndroid Build Coastguard Worker paddw m7, m7 2082*c0909341SAndroid Build Coastguard Worker paddw m6, m0 2083*c0909341SAndroid Build Coastguard Worker.w4_loop: 2084*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 2085*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base_x0 2086*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r2*2] 2087*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 2088*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base_x1 2089*c0909341SAndroid Build Coastguard Worker movu xm3, [rsp+r3*2] 
2090*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 2091*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base_x2 2092*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r2*2], 1 2093*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 2094*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base_x3 2095*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [rsp+r3*2], 1 2096*c0909341SAndroid Build Coastguard Worker pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 2097*c0909341SAndroid Build Coastguard Worker pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 2098*c0909341SAndroid Build Coastguard Worker pand m2, m11, m6 2099*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m3 2100*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m3 2101*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2102*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2103*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2104*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2105*c0909341SAndroid Build Coastguard Worker cmp r3d, 64 2106*c0909341SAndroid Build Coastguard Worker jge .w4_toponly 2107*c0909341SAndroid Build Coastguard Worker movu xm2, [dyq] 2108*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [dyq+r8*2], 1 2109*c0909341SAndroid Build Coastguard Worker movu xm3, [dyq+r5*2] 2110*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [dyq+r9*2], 1 2111*c0909341SAndroid Build Coastguard Worker pshufb m2, m9 2112*c0909341SAndroid Build Coastguard Worker pshufb m3, m9 2113*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 2114*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 2115*c0909341SAndroid Build Coastguard Worker psubw m2, m1 2116*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 2117*c0909341SAndroid Build Coastguard Worker psraw m3, m6, 15 ; base_x < topleft 2118*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2119*c0909341SAndroid Build Coastguard 
Worker vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 2120*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m1, m3 2121*c0909341SAndroid Build Coastguard Worker.w4_toponly: 2122*c0909341SAndroid Build Coastguard Worker paddw m6, m7 ; xpos += dx 2123*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 2124*c0909341SAndroid Build Coastguard Worker add dyq, r11 2125*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 2126*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 2127*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 2128*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 2129*c0909341SAndroid Build Coastguard Worker movhps [dstq+r3 ], xm1 2130*c0909341SAndroid Build Coastguard Worker sub hd, 4 2131*c0909341SAndroid Build Coastguard Worker jz .w4_end 2132*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2133*c0909341SAndroid Build Coastguard Worker cmp r2d, r10d 2134*c0909341SAndroid Build Coastguard Worker jge .w4_loop 2135*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop: 2136*c0909341SAndroid Build Coastguard Worker movu xm1, [dyq] 2137*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [dyq+r8*2], 1 2138*c0909341SAndroid Build Coastguard Worker movu xm2, [dyq+r5*2] 2139*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [dyq+r9*2], 1 2140*c0909341SAndroid Build Coastguard Worker add dyq, r11 2141*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 2142*c0909341SAndroid Build Coastguard Worker pshufb m2, m9 2143*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1, m2 2144*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 2145*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2146*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 2147*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2148*c0909341SAndroid Build Coastguard Worker vpermd m0, m4, m0 2149*c0909341SAndroid Build Coastguard 
Worker vextracti128 xm1, m0, 1 2150*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 2151*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 2152*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 2153*c0909341SAndroid Build Coastguard Worker movhps [dstq+r3 ], xm1 2154*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2155*c0909341SAndroid Build Coastguard Worker sub hd, 4 2156*c0909341SAndroid Build Coastguard Worker jg .w4_leftonly_loop 2157*c0909341SAndroid Build Coastguard Worker.w4_end: 2158*c0909341SAndroid Build Coastguard Worker RET 2159*c0909341SAndroid Build Coastguard Worker.w8: 2160*c0909341SAndroid Build Coastguard Worker mov r10d, hd 2161*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2162*c0909341SAndroid Build Coastguard Worker jnz .w8_main 2163*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+126] 2164*c0909341SAndroid Build Coastguard Worker xor r8d, r8d 2165*c0909341SAndroid Build Coastguard Worker mov r3b, hb 2166*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2167*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2168*c0909341SAndroid Build Coastguard Worker movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 2169*c0909341SAndroid Build Coastguard Worker mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 2170*c0909341SAndroid Build Coastguard Worker pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 2171*c0909341SAndroid Build Coastguard Worker pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 2172*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm4, r8m ; pixel_max 2173*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0 2174*c0909341SAndroid Build Coastguard Worker paddw xm2, xm3 2175*c0909341SAndroid Build Coastguard Worker not r8d 2176*c0909341SAndroid Build Coastguard Worker psubw xm2, xm1, xm2 2177*c0909341SAndroid Build Coastguard Worker add dxd, dxd 2178*c0909341SAndroid Build Coastguard Worker psraw xm2, 3 
2179*c0909341SAndroid Build Coastguard Worker sub angled, 53 ; angle - 53 2180*c0909341SAndroid Build Coastguard Worker pxor xm3, xm3 2181*c0909341SAndroid Build Coastguard Worker paddw xm2, xm1 2182*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2183*c0909341SAndroid Build Coastguard Worker pmaxsw xm2, xm3 2184*c0909341SAndroid Build Coastguard Worker xor angled, 0x7f ; 180 - angle 2185*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm3 2186*c0909341SAndroid Build Coastguard Worker pminsw xm2, xm4 2187*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2, xm0 2188*c0909341SAndroid Build Coastguard Worker punpckhwd xm2, xm0 2189*c0909341SAndroid Build Coastguard Worker movu [rsp+130], xm1 2190*c0909341SAndroid Build Coastguard Worker movu [rsp+146], xm2 2191*c0909341SAndroid Build Coastguard Worker call .filter_strength 2192*c0909341SAndroid Build Coastguard Worker jmp .w8_filter_left 2193*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above: 2194*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2195*c0909341SAndroid Build Coastguard Worker sub angled, 90 ; angle - 90 2196*c0909341SAndroid Build Coastguard Worker call .filter_strength 2197*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2198*c0909341SAndroid Build Coastguard Worker jz .w8_no_filter_above 2199*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2200*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 2201*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 2202*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] 2203*c0909341SAndroid Build Coastguard Worker movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x 2204*c0909341SAndroid Build Coastguard Worker pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x 2205*c0909341SAndroid Build Coastguard Worker pmullw xm4, xm0 2206*c0909341SAndroid Build Coastguard Worker pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 
7 8 8 x 2207*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 2208*c0909341SAndroid Build Coastguard Worker vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x 2209*c0909341SAndroid Build Coastguard Worker paddw xm2, xm3 2210*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, r6m ; max_width 2211*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5 2212*c0909341SAndroid Build Coastguard Worker pmullw xm2, xm6 2213*c0909341SAndroid Build Coastguard Worker packssdw xm3, xm3 2214*c0909341SAndroid Build Coastguard Worker paddw xm1, xm4 2215*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2216*c0909341SAndroid Build Coastguard Worker psubw xm3, [base+pw_1to16] 2217*c0909341SAndroid Build Coastguard Worker pxor xm4, xm4 2218*c0909341SAndroid Build Coastguard Worker psrlw xm1, 3 2219*c0909341SAndroid Build Coastguard Worker pminsw xm3, xm11 2220*c0909341SAndroid Build Coastguard Worker pavgw xm1, xm4 2221*c0909341SAndroid Build Coastguard Worker vpblendvb xm1, xm0, xm3 2222*c0909341SAndroid Build Coastguard Worker movu [rsp+130], xm1 2223*c0909341SAndroid Build Coastguard Worker.w8_no_filter_above: 2224*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq-51] 2225*c0909341SAndroid Build Coastguard Worker mov r3b, hb 2226*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2227*c0909341SAndroid Build Coastguard Worker jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm 2228*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+pb_90] 2229*c0909341SAndroid Build Coastguard Worker psubb m0, m7 2230*c0909341SAndroid Build Coastguard Worker pand m0, m8 2231*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m9 2232*c0909341SAndroid Build Coastguard Worker pmovmskb r3d, m0 2233*c0909341SAndroid Build Coastguard Worker.w8_filter_left: 2234*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2235*c0909341SAndroid Build Coastguard Worker jz .w8_main 2236*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2237*c0909341SAndroid 
Build Coastguard Worker cmp r3d, 3 2238*c0909341SAndroid Build Coastguard Worker jne .w8_filter_left_s12 2239*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_3] 2240*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_16] 2241*c0909341SAndroid Build Coastguard Worker cmp hd, 16 ; flags needed for later 2242*c0909341SAndroid Build Coastguard Worker jmp .filter_left_s3b 2243*c0909341SAndroid Build Coastguard Worker.w8_upsample_left: 2244*c0909341SAndroid Build Coastguard Worker call .upsample_left 2245*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [base+z2_y_shuf_us] 2246*c0909341SAndroid Build Coastguard Worker lea r11, [rsp+118] 2247*c0909341SAndroid Build Coastguard Worker mov r8, -8 2248*c0909341SAndroid Build Coastguard Worker jmp .w8_main_upsample_left 2249*c0909341SAndroid Build Coastguard Worker.w16_filter_left_s12: 2250*c0909341SAndroid Build Coastguard Worker xor r8d, r8d 2251*c0909341SAndroid Build Coastguard Worker.w8_filter_left_s12: 2252*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2253*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, r7m ; max_height 2254*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2255*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2256*c0909341SAndroid Build Coastguard Worker pmullw m2, m0 2257*c0909341SAndroid Build Coastguard Worker cmp hd, 8 2258*c0909341SAndroid Build Coastguard Worker jl .w8_filter_left_h4 2259*c0909341SAndroid Build Coastguard Worker movu m4, [tlq-34] 2260*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m0 2261*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2262*c0909341SAndroid Build Coastguard Worker je .w8_filter_left_end 2263*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2264*c0909341SAndroid Build Coastguard 
Worker jmp .w8_filter_left_end 2265*c0909341SAndroid Build Coastguard Worker.w8_filter_left_h4: 2266*c0909341SAndroid Build Coastguard Worker pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2267*c0909341SAndroid Build Coastguard Worker.w8_filter_left_end: 2268*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2269*c0909341SAndroid Build Coastguard Worker pmullw m1, m3 2270*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2271*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2272*c0909341SAndroid Build Coastguard Worker psrlw m1, 3 2273*c0909341SAndroid Build Coastguard Worker pavgw m1, m2 2274*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 2275*c0909341SAndroid Build Coastguard Worker psubw m5, [base+pw_16to1] 2276*c0909341SAndroid Build Coastguard Worker pminsw m5, m11 2277*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m0, m5 2278*c0909341SAndroid Build Coastguard Worker mova [rsp+96], m1 2279*c0909341SAndroid Build Coastguard Worker test r8d, r8d 2280*c0909341SAndroid Build Coastguard Worker jz .w8_main 2281*c0909341SAndroid Build Coastguard Worker; upsample_main 2282*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [base+z_upsample] 2283*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [base+z2_y_shuf] 2284*c0909341SAndroid Build Coastguard Worker lea r5, [rsp+120] 2285*c0909341SAndroid Build Coastguard Worker movd xm1, dyd 2286*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [base+z_base_inc+2] 2287*c0909341SAndroid Build Coastguard Worker movd xm2, dxd 2288*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, xm1 2289*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, xm2 2290*c0909341SAndroid Build Coastguard Worker mov r7, dstq 2291*c0909341SAndroid Build Coastguard Worker paddw m4, m4 2292*c0909341SAndroid Build Coastguard Worker pmullw m0, m1, [base+z2_ymul8] 2293*c0909341SAndroid Build Coastguard Worker paddw m5, m2, m2 
2294*c0909341SAndroid Build Coastguard Worker psllw xm1, 3 2295*c0909341SAndroid Build Coastguard Worker vpblendd m2, m5, 0xf0 2296*c0909341SAndroid Build Coastguard Worker lea r2d, [dxq+(66<<6)] ; xpos 2297*c0909341SAndroid Build Coastguard Worker paddw m4, m2 2298*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q2020 2299*c0909341SAndroid Build Coastguard Worker psraw xm0, 6 2300*c0909341SAndroid Build Coastguard Worker pxor xm1, xm1 2301*c0909341SAndroid Build Coastguard Worker psubw xm8, xm1, xm0 2302*c0909341SAndroid Build Coastguard Worker pand m6, m11 2303*c0909341SAndroid Build Coastguard Worker punpckhwd xm9, xm8, xm1 2304*c0909341SAndroid Build Coastguard Worker psllw m6, 9 2305*c0909341SAndroid Build Coastguard Worker punpcklwd xm8, xm1 2306*c0909341SAndroid Build Coastguard Worker.w8_upsample_above_loop: 2307*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 2308*c0909341SAndroid Build Coastguard Worker shr r2d, 6 2309*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r2*2] 2310*c0909341SAndroid Build Coastguard Worker movu xm2, [rsp+r2*2+16] 2311*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 2312*c0909341SAndroid Build Coastguard Worker shr r3d, 6 2313*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r3*2], 1 2314*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [rsp+r3*2+16], 1 2315*c0909341SAndroid Build Coastguard Worker pshufb m1, m10 2316*c0909341SAndroid Build Coastguard Worker pshufb m2, m10 2317*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 2318*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m2 2319*c0909341SAndroid Build Coastguard Worker pand m2, m11, m4 2320*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2321*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2322*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2323*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2324*c0909341SAndroid Build Coastguard Worker 
cmp r3d, 64 2325*c0909341SAndroid Build Coastguard Worker jge .w8_upsample_above_toponly 2326*c0909341SAndroid Build Coastguard Worker mova m1, m5 2327*c0909341SAndroid Build Coastguard Worker vpgatherdq m3, [r5+xm9*2], m5 2328*c0909341SAndroid Build Coastguard Worker mova m5, m1 2329*c0909341SAndroid Build Coastguard Worker vpgatherdq m2, [r5+xm8*2], m1 2330*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 2331*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 2332*c0909341SAndroid Build Coastguard Worker punpckldq m1, m2, m3 2333*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m3 2334*c0909341SAndroid Build Coastguard Worker psubw m2, m1 2335*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 2336*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2337*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, q3120 2338*c0909341SAndroid Build Coastguard Worker psraw m2, m4, 15 2339*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m1, m2 2340*c0909341SAndroid Build Coastguard Worker.w8_upsample_above_toponly: 2341*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2342*c0909341SAndroid Build Coastguard Worker sub r5, 4 2343*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 2344*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 2345*c0909341SAndroid Build Coastguard Worker sub hd, 2 2346*c0909341SAndroid Build Coastguard Worker jz .w8_ret 2347*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2348*c0909341SAndroid Build Coastguard Worker jmp .w8_upsample_above_loop 2349*c0909341SAndroid Build Coastguard Worker.w8_main: 2350*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [base+z2_y_shuf] 2351*c0909341SAndroid Build Coastguard Worker lea r11, [rsp+120] 2352*c0909341SAndroid Build Coastguard Worker mov r8, -4 2353*c0909341SAndroid Build Coastguard Worker.w8_main_upsample_left: 2354*c0909341SAndroid Build Coastguard Worker movd xm1, dyd 2355*c0909341SAndroid 
Build Coastguard Worker vbroadcasti128 m4, [base+z_base_inc+2] 2356*c0909341SAndroid Build Coastguard Worker movd xm2, dxd 2357*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, xm1 2358*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, xm2 2359*c0909341SAndroid Build Coastguard Worker mov r7, dstq 2360*c0909341SAndroid Build Coastguard Worker pmullw m0, m1, [base+z2_ymul8] 2361*c0909341SAndroid Build Coastguard Worker paddw m5, m2, m2 2362*c0909341SAndroid Build Coastguard Worker psllw xm1, 3 2363*c0909341SAndroid Build Coastguard Worker vpblendd m2, m5, 0xf0 ; xpos0 xpos1 2364*c0909341SAndroid Build Coastguard Worker lea r9d, [dxq+(65<<6)] ; xpos 2365*c0909341SAndroid Build Coastguard Worker paddw m4, m2 2366*c0909341SAndroid Build Coastguard Worker movd [rsp+284], xm1 2367*c0909341SAndroid Build Coastguard Worker.w8_loop0: 2368*c0909341SAndroid Build Coastguard Worker mov r2d, r9d 2369*c0909341SAndroid Build Coastguard Worker mova [rsp+288], m0 2370*c0909341SAndroid Build Coastguard Worker mov r5, r11 2371*c0909341SAndroid Build Coastguard Worker mova [rsp+320], m4 2372*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q2020 2373*c0909341SAndroid Build Coastguard Worker psraw xm0, 6 2374*c0909341SAndroid Build Coastguard Worker pxor xm1, xm1 2375*c0909341SAndroid Build Coastguard Worker psubw xm8, xm1, xm0 ; base_y 2376*c0909341SAndroid Build Coastguard Worker pand m6, m11 ; frac_y 2377*c0909341SAndroid Build Coastguard Worker punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 2378*c0909341SAndroid Build Coastguard Worker psllw m6, 9 2379*c0909341SAndroid Build Coastguard Worker punpcklwd xm8, xm1 ; base_y 0 1 4 5 2380*c0909341SAndroid Build Coastguard Worker.w8_loop: 2381*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 2382*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base_x0 2383*c0909341SAndroid Build Coastguard Worker movu xm0, [rsp+r2*2] 2384*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r2*2+2] 
2385*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 2386*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base_x1 2387*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [rsp+r3*2], 1 2388*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r3*2+2], 1 2389*c0909341SAndroid Build Coastguard Worker pand m2, m11, m4 2390*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2391*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2392*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2393*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2394*c0909341SAndroid Build Coastguard Worker cmp r3d, 64 2395*c0909341SAndroid Build Coastguard Worker jge .w8_toponly 2396*c0909341SAndroid Build Coastguard Worker mova m1, m5 2397*c0909341SAndroid Build Coastguard Worker vpgatherdq m3, [r5+xm9*2], m5 2398*c0909341SAndroid Build Coastguard Worker mova m5, m1 2399*c0909341SAndroid Build Coastguard Worker vpgatherdq m2, [r5+xm8*2], m1 2400*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 2401*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 2402*c0909341SAndroid Build Coastguard Worker punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 2403*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m3 2404*c0909341SAndroid Build Coastguard Worker psubw m2, m1 2405*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 2406*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2407*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, q3120 2408*c0909341SAndroid Build Coastguard Worker psraw m2, m4, 15 ; base_x < topleft 2409*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m1, m2 2410*c0909341SAndroid Build Coastguard Worker.w8_toponly: 2411*c0909341SAndroid Build Coastguard Worker paddw m4, m5 ; xpos += dx 2412*c0909341SAndroid Build Coastguard Worker add r5, r8 2413*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 
2414*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 2415*c0909341SAndroid Build Coastguard Worker sub hd, 2 2416*c0909341SAndroid Build Coastguard Worker jz .w8_end 2417*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2418*c0909341SAndroid Build Coastguard Worker cmp r2d, (63-8)<<6 2419*c0909341SAndroid Build Coastguard Worker jge .w8_loop 2420*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop: 2421*c0909341SAndroid Build Coastguard Worker mova m0, m5 2422*c0909341SAndroid Build Coastguard Worker vpgatherdq m4, [r5+xm9*2], m5 2423*c0909341SAndroid Build Coastguard Worker mova m5, m0 2424*c0909341SAndroid Build Coastguard Worker vpgatherdq m3, [r5+xm8*2], m0 2425*c0909341SAndroid Build Coastguard Worker add r5, r8 2426*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m7 2427*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m7 2428*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1, m2 2429*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m2 2430*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2431*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 2432*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2433*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 2434*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 2435*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 2436*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2437*c0909341SAndroid Build Coastguard Worker sub hd, 2 2438*c0909341SAndroid Build Coastguard Worker jg .w8_leftonly_loop 2439*c0909341SAndroid Build Coastguard Worker.w8_end: 2440*c0909341SAndroid Build Coastguard Worker sub r10d, 1<<8 2441*c0909341SAndroid Build Coastguard Worker jl .w8_ret 2442*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [rsp+284] 2443*c0909341SAndroid Build Coastguard Worker add r7, 16 2444*c0909341SAndroid Build Coastguard Worker paddw m0, 
[rsp+288] ; base_y += 8*dy 2445*c0909341SAndroid Build Coastguard Worker add r9d, 8<<6 2446*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pw_512] 2447*c0909341SAndroid Build Coastguard Worker movzx hd, r10b 2448*c0909341SAndroid Build Coastguard Worker paddw m4, [rsp+320] ; base_x += 8*64 2449*c0909341SAndroid Build Coastguard Worker mov dstq, r7 2450*c0909341SAndroid Build Coastguard Worker jmp .w8_loop0 2451*c0909341SAndroid Build Coastguard Worker.w8_ret: 2452*c0909341SAndroid Build Coastguard Worker RET 2453*c0909341SAndroid Build Coastguard Worker.w16: 2454*c0909341SAndroid Build Coastguard Worker movd xm0, [tlq+32] 2455*c0909341SAndroid Build Coastguard Worker lea r10d, [hq+(1<<8)] 2456*c0909341SAndroid Build Coastguard Worker movd [rsp+160], xm0 2457*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2458*c0909341SAndroid Build Coastguard Worker jnz .w8_main 2459*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+15] 2460*c0909341SAndroid Build Coastguard Worker sub angled, 90 2461*c0909341SAndroid Build Coastguard Worker call .filter_strength 2462*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2463*c0909341SAndroid Build Coastguard Worker jz .w16_no_filter_above 2464*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2465*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 2466*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] 2467*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] 2468*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2469*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm1, xm1 2470*c0909341SAndroid Build Coastguard Worker vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2471*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0, m0 2472*c0909341SAndroid Build Coastguard Worker pmullw m4, m0 2473*c0909341SAndroid Build 
Coastguard Worker vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2474*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2475*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 2476*c0909341SAndroid Build Coastguard Worker paddw m2, m3 2477*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, r6m ; max_width 2478*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 2479*c0909341SAndroid Build Coastguard Worker pmullw m2, m6 2480*c0909341SAndroid Build Coastguard Worker packssdw m3, m3 2481*c0909341SAndroid Build Coastguard Worker paddw m1, m4 2482*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2483*c0909341SAndroid Build Coastguard Worker psubw m3, [base+pw_1to16] 2484*c0909341SAndroid Build Coastguard Worker pxor m4, m4 2485*c0909341SAndroid Build Coastguard Worker psrlw m1, 3 2486*c0909341SAndroid Build Coastguard Worker pminsw m3, m11 2487*c0909341SAndroid Build Coastguard Worker pavgw m1, m4 2488*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m0, m3 2489*c0909341SAndroid Build Coastguard Worker movu [rsp+130], m1 2490*c0909341SAndroid Build Coastguard Worker.w16_no_filter_above: 2491*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+pb_90] 2492*c0909341SAndroid Build Coastguard Worker psubb m0, m7 2493*c0909341SAndroid Build Coastguard Worker pand m0, m8 2494*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m9 2495*c0909341SAndroid Build Coastguard Worker pmovmskb r3d, m0 2496*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2497*c0909341SAndroid Build Coastguard Worker jz .w8_main 2498*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2499*c0909341SAndroid Build Coastguard Worker cmp r3d, 3 2500*c0909341SAndroid Build Coastguard Worker jne .w16_filter_left_s12 2501*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_3] 2502*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_16] 2503*c0909341SAndroid Build 
Coastguard Worker cmp hd, 4 2504*c0909341SAndroid Build Coastguard Worker jne .filter_left_s3 2505*c0909341SAndroid Build Coastguard Worker movq xm0, [tlq-8] ; 0 1 2 3 2506*c0909341SAndroid Build Coastguard Worker movq xm1, [tlq-6] ; 1 2 3 4 2507*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, r7m ; max_height 2508*c0909341SAndroid Build Coastguard Worker movq xm4, [base+pw_16to1+24] ; 4to1 2509*c0909341SAndroid Build Coastguard Worker pshuflw xm2, xm0, q2100 ; 0 0 1 2 2510*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm1, q3321 ; 2 3 4 4 2511*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0 2512*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2513*c0909341SAndroid Build Coastguard Worker pshuflw xm2, xm0, q1000 ; 0 0 0 1 2514*c0909341SAndroid Build Coastguard Worker paddw xm3, xm6 2515*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 2516*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm3 2517*c0909341SAndroid Build Coastguard Worker psubw xm5, xm4 2518*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2519*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm11 2520*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 2521*c0909341SAndroid Build Coastguard Worker vpblendvb xm1, xm0, xm5 2522*c0909341SAndroid Build Coastguard Worker movq [rsp+120], xm1 2523*c0909341SAndroid Build Coastguard Worker jmp .w8_main 2524*c0909341SAndroid Build Coastguard Worker.w32: 2525*c0909341SAndroid Build Coastguard Worker mova m2, [tlq+32] 2526*c0909341SAndroid Build Coastguard Worker movd xm0, [tlq+64] 2527*c0909341SAndroid Build Coastguard Worker lea r10d, [hq+(3<<8)] 2528*c0909341SAndroid Build Coastguard Worker mova [rsp+160], m2 2529*c0909341SAndroid Build Coastguard Worker movd [rsp+192], xm0 2530*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2531*c0909341SAndroid Build Coastguard Worker jnz .w8_main 2532*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_3] 2533*c0909341SAndroid Build 
Coastguard Worker vpbroadcastd m0, r6m ; max_width 2534*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_16] 2535*c0909341SAndroid Build Coastguard Worker mov r3d, 32 2536*c0909341SAndroid Build Coastguard Worker packssdw m0, m0 2537*c0909341SAndroid Build Coastguard Worker psubw m0, [base+pw_1to16] 2538*c0909341SAndroid Build Coastguard Worker pminsw m8, m0, m11 2539*c0909341SAndroid Build Coastguard Worker psubw m9, m8, m7 2540*c0909341SAndroid Build Coastguard Worker.w32_filter_above: 2541*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+2] 2542*c0909341SAndroid Build Coastguard Worker punpcklwd xm4, xm1, xm1 2543*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [tlq+6] 2544*c0909341SAndroid Build Coastguard Worker paddw m1, m0 2545*c0909341SAndroid Build Coastguard Worker vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2546*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq+4] 2547*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r3+2] 2548*c0909341SAndroid Build Coastguard Worker paddw m5, m6, [tlq+r3-2] 2549*c0909341SAndroid Build Coastguard Worker pavgw m2, m4 2550*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m3, m3 2551*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2552*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 2553*c0909341SAndroid Build Coastguard Worker vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 2554*c0909341SAndroid Build Coastguard Worker pavgw m2, m5 2555*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [tlq+r3] 2556*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2557*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 2558*c0909341SAndroid Build Coastguard Worker paddw m2, m4 2559*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m0, m8 2560*c0909341SAndroid Build Coastguard Worker psrlw m2, 2 2561*c0909341SAndroid Build Coastguard Worker vpblendvb m2, m3, m9 
2562*c0909341SAndroid Build Coastguard Worker movu [rsp+130], m1 2563*c0909341SAndroid Build Coastguard Worker movu [rsp+r3+130], m2 2564*c0909341SAndroid Build Coastguard Worker.filter_left_s3: 2565*c0909341SAndroid Build Coastguard Worker cmp hd, 16 2566*c0909341SAndroid Build Coastguard Worker jl .filter_left_s3_h8 ; h8 2567*c0909341SAndroid Build Coastguard Worker.filter_left_s3b: 2568*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2569*c0909341SAndroid Build Coastguard Worker movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i 2570*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, r7m ; max_height 2571*c0909341SAndroid Build Coastguard Worker paddw m1, m0, m2 2572*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m2 2573*c0909341SAndroid Build Coastguard Worker mov r3d, hd 2574*c0909341SAndroid Build Coastguard Worker vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 2575*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 2576*c0909341SAndroid Build Coastguard Worker not r3 2577*c0909341SAndroid Build Coastguard Worker psubw m5, [base+pw_16to1] 2578*c0909341SAndroid Build Coastguard Worker paddw m2, m6 2579*c0909341SAndroid Build Coastguard Worker pminsw m8, m11, m5 2580*c0909341SAndroid Build Coastguard Worker je .filter_left_s3_end ; h16 2581*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2582*c0909341SAndroid Build Coastguard Worker pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2583*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2584*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 2585*c0909341SAndroid Build Coastguard Worker vpblendvb m3, m1, m0, m8 2586*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2587*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i 2588*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [tlq-60] ; 
4 5 6 7 8 9 a b c d e f g h i j 2589*c0909341SAndroid Build Coastguard Worker psubw m8, m7 2590*c0909341SAndroid Build Coastguard Worker mova [rsp+96], m3 2591*c0909341SAndroid Build Coastguard Worker jnp .filter_left_s3_end ; h32 2592*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-96] 2593*c0909341SAndroid Build Coastguard Worker paddw m1, [tlq-66] 2594*c0909341SAndroid Build Coastguard Worker pavgw m2, [tlq-68] 2595*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2596*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [tlq-94] 2597*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [tlq-92] 2598*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 2599*c0909341SAndroid Build Coastguard Worker paddw m4, [tlq- 98] 2600*c0909341SAndroid Build Coastguard Worker pavgw m2, [tlq-100] 2601*c0909341SAndroid Build Coastguard Worker vpblendvb m3, m1, m0, m8 2602*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-128] 2603*c0909341SAndroid Build Coastguard Worker psubw m8, m7 2604*c0909341SAndroid Build Coastguard Worker paddw m4, m2 2605*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [tlq-126] 2606*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [tlq-124] 2607*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 2608*c0909341SAndroid Build Coastguard Worker mova [rsp+64], m3 2609*c0909341SAndroid Build Coastguard Worker vpblendvb m4, m5, m8 2610*c0909341SAndroid Build Coastguard Worker psubw m8, m7 2611*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m4 2612*c0909341SAndroid Build Coastguard Worker.filter_left_s3_end: 2613*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0, xm0 2614*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g 2615*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f 2616*c0909341SAndroid Build Coastguard Worker paddw m1, m4 2617*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 
2618*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2619*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 2620*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m0, m8 2621*c0909341SAndroid Build Coastguard Worker mova [rsp+r3*2+130], m1 2622*c0909341SAndroid Build Coastguard Worker jmp .w8_main 2623*c0909341SAndroid Build Coastguard Worker.filter_left_s3_h8: 2624*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 2625*c0909341SAndroid Build Coastguard Worker movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 2626*c0909341SAndroid Build Coastguard Worker pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 2627*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, r7m ; max_height 2628*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0, xm3 2629*c0909341SAndroid Build Coastguard Worker pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 2630*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2631*c0909341SAndroid Build Coastguard Worker vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 2632*c0909341SAndroid Build Coastguard Worker paddw xm3, xm6 2633*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 2634*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm3 2635*c0909341SAndroid Build Coastguard Worker psubw xm5, [base+pw_16to1+16] ; 8to1 2636*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2637*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm11 2638*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 2639*c0909341SAndroid Build Coastguard Worker vpblendvb xm1, xm0, xm5 2640*c0909341SAndroid Build Coastguard Worker mova [rsp+112], xm1 2641*c0909341SAndroid Build Coastguard Worker jmp .w8_main 2642*c0909341SAndroid Build Coastguard Worker.w64: 2643*c0909341SAndroid Build Coastguard Worker mova m2, [tlq+ 32] 2644*c0909341SAndroid Build Coastguard Worker mova m3, [tlq+ 64] 2645*c0909341SAndroid Build Coastguard Worker mova m4, [tlq+ 96] 2646*c0909341SAndroid Build Coastguard Worker movd xm0, [tlq+128] 
2647*c0909341SAndroid Build Coastguard Worker lea r10d, [hq+(7<<8)] 2648*c0909341SAndroid Build Coastguard Worker mova [rsp+160], m2 2649*c0909341SAndroid Build Coastguard Worker mova [rsp+192], m3 2650*c0909341SAndroid Build Coastguard Worker mova [rsp+224], m4 2651*c0909341SAndroid Build Coastguard Worker movd [rsp+256], xm0 2652*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2653*c0909341SAndroid Build Coastguard Worker jnz .w8_main 2654*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_3] 2655*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2656*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2657*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2658*c0909341SAndroid Build Coastguard Worker pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h 2659*c0909341SAndroid Build Coastguard Worker paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h 2660*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+66] 2661*c0909341SAndroid Build Coastguard Worker paddw m3, m6, [tlq+62] 2662*c0909341SAndroid Build Coastguard Worker paddw m7, m4, [tlq+64] 2663*c0909341SAndroid Build Coastguard Worker pavgw m3, [tlq+70] 2664*c0909341SAndroid Build Coastguard Worker paddw m7, [tlq+68] 2665*c0909341SAndroid Build Coastguard Worker paddw m2, m5 2666*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, r6m ; max_width 2667*c0909341SAndroid Build Coastguard Worker mov r3d, 96 2668*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 2669*c0909341SAndroid Build Coastguard Worker paddw m3, m7 2670*c0909341SAndroid Build Coastguard Worker psubw m5, [base+pw_1to16] 2671*c0909341SAndroid Build Coastguard Worker psrlw m2, 2 2672*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_16] 2673*c0909341SAndroid Build Coastguard Worker psrlw m3, 2 2674*c0909341SAndroid Build Coastguard Worker pminsw m8, 
m11, m5 2675*c0909341SAndroid Build Coastguard Worker psubw m9, m8, m7 2676*c0909341SAndroid Build Coastguard Worker vpblendvb m2, m0, m9 2677*c0909341SAndroid Build Coastguard Worker psubw m9, m7 2678*c0909341SAndroid Build Coastguard Worker vpblendvb m3, m4, m9 2679*c0909341SAndroid Build Coastguard Worker psubw m9, m7 2680*c0909341SAndroid Build Coastguard Worker movu [rsp+162], m2 2681*c0909341SAndroid Build Coastguard Worker movu [rsp+194], m3 2682*c0909341SAndroid Build Coastguard Worker jmp .w32_filter_above 2683*c0909341SAndroid Build Coastguard Worker 2684*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase 2685*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_z3_16bpc_avx2_table] 2686*c0909341SAndroid Build Coastguard Worker tzcnt hd, hm 2687*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 2688*c0909341SAndroid Build Coastguard Worker lea r7, [dr_intra_derivative+45*2-1] 2689*c0909341SAndroid Build Coastguard Worker sub tlq, 2 2690*c0909341SAndroid Build Coastguard Worker movsxd hq, [r6+hq*4] 2691*c0909341SAndroid Build Coastguard Worker sub angled, 180 2692*c0909341SAndroid Build Coastguard Worker add hq, r6 2693*c0909341SAndroid Build Coastguard Worker mov dyd, angled 2694*c0909341SAndroid Build Coastguard Worker neg dyd 2695*c0909341SAndroid Build Coastguard Worker xor angled, 0x400 2696*c0909341SAndroid Build Coastguard Worker or dyq, ~0x7e 2697*c0909341SAndroid Build Coastguard Worker movzx dyd, word [r7+dyq] 2698*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_62] 2699*c0909341SAndroid Build Coastguard Worker mov org_wd, wd 2700*c0909341SAndroid Build Coastguard Worker jmp hq 2701*c0909341SAndroid Build Coastguard Worker.h4: 2702*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -64, 7 2703*c0909341SAndroid Build Coastguard Worker lea r7, [strideq*3] 2704*c0909341SAndroid Build Coastguard Worker cmp angleb, 40 2705*c0909341SAndroid Build 
Coastguard Worker jae .h4_no_upsample 2706*c0909341SAndroid Build Coastguard Worker lea r4d, [angleq-1024] 2707*c0909341SAndroid Build Coastguard Worker sar r4d, 7 2708*c0909341SAndroid Build Coastguard Worker add r4d, wd 2709*c0909341SAndroid Build Coastguard Worker jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) 2710*c0909341SAndroid Build Coastguard Worker mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 2711*c0909341SAndroid Build Coastguard Worker pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 2712*c0909341SAndroid Build Coastguard Worker vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 2713*c0909341SAndroid Build Coastguard Worker pshufd xm3, xm1, q0000 2714*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2715*c0909341SAndroid Build Coastguard Worker paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 2716*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm4, r8m ; pixel_max 2717*c0909341SAndroid Build Coastguard Worker add dyd, dyd 2718*c0909341SAndroid Build Coastguard Worker psubw xm0, xm1, xm0 2719*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0], xm3 2720*c0909341SAndroid Build Coastguard Worker movd xm3, dyd 2721*c0909341SAndroid Build Coastguard Worker psraw xm0, 3 2722*c0909341SAndroid Build Coastguard Worker neg dyd 2723*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0 2724*c0909341SAndroid Build Coastguard Worker pxor xm0, xm0 2725*c0909341SAndroid Build Coastguard Worker lea r2d, [dyq+(16<<6)+63] ; ypos 2726*c0909341SAndroid Build Coastguard Worker pmaxsw xm1, xm0 2727*c0909341SAndroid Build Coastguard Worker pavgw xm1, xm0 2728*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 2729*c0909341SAndroid Build Coastguard Worker pminsw xm1, xm4 2730*c0909341SAndroid Build Coastguard Worker punpckhwd xm0, xm1, xm2 2731*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2 2732*c0909341SAndroid Build Coastguard Worker paddw m2, m3, m3 2733*c0909341SAndroid Build Coastguard Worker mova [rsp+32], xm0 
2734*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m2 2735*c0909341SAndroid Build Coastguard Worker mova [rsp+16], xm1 2736*c0909341SAndroid Build Coastguard Worker paddw m4, m2, m2 2737*c0909341SAndroid Build Coastguard Worker paddw m2, m3 2738*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 2739*c0909341SAndroid Build Coastguard Worker.h4_upsample_loop: 2740*c0909341SAndroid Build Coastguard Worker lea r4d, [r2+dyq] 2741*c0909341SAndroid Build Coastguard Worker shr r2d, 6 2742*c0909341SAndroid Build Coastguard Worker movu xm1, [rsp+r2*2] 2743*c0909341SAndroid Build Coastguard Worker lea r2d, [r4+dyq] 2744*c0909341SAndroid Build Coastguard Worker shr r4d, 6 2745*c0909341SAndroid Build Coastguard Worker movu xm2, [rsp+r4*2] 2746*c0909341SAndroid Build Coastguard Worker lea r4d, [r2+dyq] 2747*c0909341SAndroid Build Coastguard Worker shr r2d, 6 2748*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [rsp+r2*2], 1 2749*c0909341SAndroid Build Coastguard Worker lea r2d, [r4+dyq] 2750*c0909341SAndroid Build Coastguard Worker shr r4d, 6 2751*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [rsp+r4*2], 1 2752*c0909341SAndroid Build Coastguard Worker psrld m0, m1, 16 2753*c0909341SAndroid Build Coastguard Worker pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 2754*c0909341SAndroid Build Coastguard Worker pslld m2, 16 2755*c0909341SAndroid Build Coastguard Worker pblendw m1, m2, 0xaa 2756*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 2757*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2758*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2759*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2760*c0909341SAndroid Build Coastguard Worker paddw m3, m4 2761*c0909341SAndroid Build Coastguard Worker paddw m1, m0 2762*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m1, 1 2763*c0909341SAndroid Build Coastguard Worker punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 
b0 c0 d0 2764*c0909341SAndroid Build Coastguard Worker punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2765*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0 2766*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm0 2767*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm1 2768*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm1 2769*c0909341SAndroid Build Coastguard Worker add dstq, 8 2770*c0909341SAndroid Build Coastguard Worker sub wd, 4 2771*c0909341SAndroid Build Coastguard Worker jg .h4_upsample_loop 2772*c0909341SAndroid Build Coastguard Worker RET 2773*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2774*c0909341SAndroid Build Coastguard Worker.filter_strength: ; h4/h8/h16 2775*c0909341SAndroid Build Coastguard Worker%define base r4-z_filter_t0 2776*c0909341SAndroid Build Coastguard Worker lea r4, [z_filter_t0] 2777*c0909341SAndroid Build Coastguard Worker movd xm0, maxbased 2778*c0909341SAndroid Build Coastguard Worker movd xm1, angled 2779*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2780*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, xm0 2781*c0909341SAndroid Build Coastguard Worker vpbroadcastb m1, xm1 2782*c0909341SAndroid Build Coastguard Worker pcmpeqb m0, [base+z_filter_wh] 2783*c0909341SAndroid Build Coastguard Worker pand m0, m1 2784*c0909341SAndroid Build Coastguard Worker mova xm1, [r4+angleq*8] 2785*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m1 2786*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m0 2787*c0909341SAndroid Build Coastguard Worker ret 2788*c0909341SAndroid Build Coastguard Worker.h4_no_upsample: 2789*c0909341SAndroid Build Coastguard Worker mov maxbased, 7 2790*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 2791*c0909341SAndroid Build Coastguard Worker jnz .h4_main 2792*c0909341SAndroid Build Coastguard Worker lea maxbased, [wq+3] 2793*c0909341SAndroid Build Coastguard 
Worker call .filter_strength 2794*c0909341SAndroid Build Coastguard Worker mov maxbased, 7 2795*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2796*c0909341SAndroid Build Coastguard Worker jz .h4_main ; filter_strength == 0 2797*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 2798*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 2799*c0909341SAndroid Build Coastguard Worker movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 2800*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] 2801*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] 2802*c0909341SAndroid Build Coastguard Worker pmullw xm2, xm0 2803*c0909341SAndroid Build Coastguard Worker pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 2804*c0909341SAndroid Build Coastguard Worker paddw xm1, xm0, xm3 2805*c0909341SAndroid Build Coastguard Worker movd [rsp+12], xm0 2806*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm4 2807*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 2808*c0909341SAndroid Build Coastguard Worker jne .h4_filter_3tap 2809*c0909341SAndroid Build Coastguard Worker pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 2810*c0909341SAndroid Build Coastguard Worker vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 2811*c0909341SAndroid Build Coastguard Worker movzx r4d, word [tlq-14] 2812*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq-12] 2813*c0909341SAndroid Build Coastguard Worker inc maxbased 2814*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2815*c0909341SAndroid Build Coastguard Worker paddw xm0, xm3 2816*c0909341SAndroid Build Coastguard Worker sub r2d, r4d 2817*c0909341SAndroid Build Coastguard Worker paddw xm2, xm0, xm0 2818*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r4*8+4] 2819*c0909341SAndroid Build Coastguard Worker shr r2d, 3 2820*c0909341SAndroid Build Coastguard Worker mov [rsp+14], r2w 2821*c0909341SAndroid Build Coastguard 
Worker.h4_filter_3tap: 2822*c0909341SAndroid Build Coastguard Worker pxor xm0, xm0 2823*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 2824*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+30] 2825*c0909341SAndroid Build Coastguard Worker psrlw xm1, 3 2826*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2827*c0909341SAndroid Build Coastguard Worker sbb maxbased, -1 2828*c0909341SAndroid Build Coastguard Worker pavgw xm0, xm1 2829*c0909341SAndroid Build Coastguard Worker mova [rsp+16], xm0 2830*c0909341SAndroid Build Coastguard Worker.h4_main: 2831*c0909341SAndroid Build Coastguard Worker movd xm3, dyd 2832*c0909341SAndroid Build Coastguard Worker neg maxbaseq 2833*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [z_base_inc] 2834*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 2835*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 2836*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 2837*c0909341SAndroid Build Coastguard Worker lea r4d, [maxbaseq+3*64] 2838*c0909341SAndroid Build Coastguard Worker neg dyq 2839*c0909341SAndroid Build Coastguard Worker movd xm2, r4d 2840*c0909341SAndroid Build Coastguard Worker sub tlq, 8 2841*c0909341SAndroid Build Coastguard Worker lea r4, [dyq+63] ; ypos 2842*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m1 2843*c0909341SAndroid Build Coastguard Worker paddw m0, m3, m3 2844*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, xm2 2845*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m0 2846*c0909341SAndroid Build Coastguard Worker paddw m4, m0, m0 2847*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2848*c0909341SAndroid Build Coastguard Worker psubw m2, m1 2849*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 2850*c0909341SAndroid Build Coastguard Worker or maxbased, 63 2851*c0909341SAndroid Build Coastguard Worker paddw m3, m2 2852*c0909341SAndroid Build Coastguard Worker.h4_loop: 
2853*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 2854*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base0 2855*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+r4*2] 2856*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 2857*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base1 2858*c0909341SAndroid Build Coastguard Worker movu xm2, [tlq+r5*2] 2859*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 2860*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base2 2861*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq+r4*2], 1 2862*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 2863*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base3 2864*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [tlq+r5*2], 1 2865*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1, m2 2866*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 2867*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 2868*c0909341SAndroid Build Coastguard Worker palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 2869*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2870*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2871*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2872*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 ; ypos < max_base_y 2873*c0909341SAndroid Build Coastguard Worker paddw m3, m4 2874*c0909341SAndroid Build Coastguard Worker paddw m1, m0 2875*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m6, m1, m2 2876*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m1, 1 2877*c0909341SAndroid Build Coastguard Worker punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 2878*c0909341SAndroid Build Coastguard Worker punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2879*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0 2880*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm0 2881*c0909341SAndroid Build Coastguard 
Worker movhps [dstq+strideq*2], xm1 2882*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm1 2883*c0909341SAndroid Build Coastguard Worker sub wd, 4 2884*c0909341SAndroid Build Coastguard Worker jz .h4_end 2885*c0909341SAndroid Build Coastguard Worker add dstq, 8 2886*c0909341SAndroid Build Coastguard Worker cmp r4d, maxbased 2887*c0909341SAndroid Build Coastguard Worker jg .h4_loop 2888*c0909341SAndroid Build Coastguard Worker.h4_end_loop: 2889*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm6 2890*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm6 2891*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm6 2892*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm6 2893*c0909341SAndroid Build Coastguard Worker add dstq, 8 2894*c0909341SAndroid Build Coastguard Worker sub wd, 4 2895*c0909341SAndroid Build Coastguard Worker jg .h4_end_loop 2896*c0909341SAndroid Build Coastguard Worker.h4_end: 2897*c0909341SAndroid Build Coastguard Worker RET 2898*c0909341SAndroid Build Coastguard Worker.h8: 2899*c0909341SAndroid Build Coastguard Worker lea r4d, [angleq+216] 2900*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -64, 8 2901*c0909341SAndroid Build Coastguard Worker mov r4b, wb 2902*c0909341SAndroid Build Coastguard Worker lea r7, [strideq*3] 2903*c0909341SAndroid Build Coastguard Worker cmp r4d, 8 2904*c0909341SAndroid Build Coastguard Worker ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 2905*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2906*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e 2907*c0909341SAndroid Build Coastguard Worker movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d 2908*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2909*c0909341SAndroid Build Coastguard Worker je .h8_upsample_w8 2910*c0909341SAndroid Build Coastguard Worker pshufhw xm3, xm2, q1000 
2911*c0909341SAndroid Build Coastguard Worker vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d 2912*c0909341SAndroid Build Coastguard Worker.h8_upsample_w8: 2913*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2914*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, r8m ; pixel_max 2915*c0909341SAndroid Build Coastguard Worker add dyd, dyd 2916*c0909341SAndroid Build Coastguard Worker psubw m0, m1, m0 2917*c0909341SAndroid Build Coastguard Worker movd xm6, dyd 2918*c0909341SAndroid Build Coastguard Worker psraw m0, 3 2919*c0909341SAndroid Build Coastguard Worker neg dyd 2920*c0909341SAndroid Build Coastguard Worker paddw m1, m0 2921*c0909341SAndroid Build Coastguard Worker pxor m0, m0 2922*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m0 2923*c0909341SAndroid Build Coastguard Worker lea r4d, [dyq+(16<<6)+63] ; ypos 2924*c0909341SAndroid Build Coastguard Worker pavgw m1, m0 2925*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 2926*c0909341SAndroid Build Coastguard Worker pminsw m1, m4 2927*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1, m2 2928*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 2929*c0909341SAndroid Build Coastguard Worker vextracti128 [rsp+48], m0, 1 2930*c0909341SAndroid Build Coastguard Worker vextracti128 [rsp+32], m1, 1 2931*c0909341SAndroid Build Coastguard Worker paddw m7, m6, m6 2932*c0909341SAndroid Build Coastguard Worker mova [rsp+16], xm0 2933*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0], xm1 2934*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 ; ypos0 ypos1 2935*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop: 2936*c0909341SAndroid Build Coastguard Worker lea r2d, [r4+dyq] 2937*c0909341SAndroid Build Coastguard Worker shr r4d, 6 ; base0 2938*c0909341SAndroid Build Coastguard Worker movu m1, [rsp+r4*2] 2939*c0909341SAndroid Build Coastguard Worker lea r4d, [r2+dyq] 2940*c0909341SAndroid Build Coastguard Worker 
shr r2d, 6 ; base1 2941*c0909341SAndroid Build Coastguard Worker movu m2, [rsp+r2*2] 2942*c0909341SAndroid Build Coastguard Worker lea r2d, [r4+dyq] 2943*c0909341SAndroid Build Coastguard Worker shr r4d, 6 ; base2 2944*c0909341SAndroid Build Coastguard Worker movu m3, [rsp+r4*2] 2945*c0909341SAndroid Build Coastguard Worker lea r4d, [r2+dyq] 2946*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base3 2947*c0909341SAndroid Build Coastguard Worker movu m4, [rsp+r2*2] 2948*c0909341SAndroid Build Coastguard Worker psrld m0, m1, 16 2949*c0909341SAndroid Build Coastguard Worker pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 2950*c0909341SAndroid Build Coastguard Worker pslld m2, 16 2951*c0909341SAndroid Build Coastguard Worker pblendw m1, m2, 0xaa 2952*c0909341SAndroid Build Coastguard Worker psrld m2, m3, 16 2953*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 2954*c0909341SAndroid Build Coastguard Worker pslld m4, 16 2955*c0909341SAndroid Build Coastguard Worker pblendw m3, m4, 0xaa 2956*c0909341SAndroid Build Coastguard Worker pand m4, m5, m6 2957*c0909341SAndroid Build Coastguard Worker paddw m6, m7 2958*c0909341SAndroid Build Coastguard Worker psllw m4, 9 2959*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2960*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 2961*c0909341SAndroid Build Coastguard Worker pand m4, m5, m6 2962*c0909341SAndroid Build Coastguard Worker psllw m4, 9 2963*c0909341SAndroid Build Coastguard Worker psubw m3, m2 2964*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 2965*c0909341SAndroid Build Coastguard Worker paddw m6, m7 2966*c0909341SAndroid Build Coastguard Worker lea r2, [dstq+strideq*4] 2967*c0909341SAndroid Build Coastguard Worker paddw m1, m0 2968*c0909341SAndroid Build Coastguard Worker paddw m3, m2 2969*c0909341SAndroid Build Coastguard Worker punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 
2970*c0909341SAndroid Build Coastguard Worker punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 2971*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m0, 1 2972*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m1, 1 2973*c0909341SAndroid Build Coastguard Worker movhps [r2 +strideq*0], xm0 2974*c0909341SAndroid Build Coastguard Worker movq [r2 +strideq*1], xm0 2975*c0909341SAndroid Build Coastguard Worker movhps [r2 +strideq*2], xm1 2976*c0909341SAndroid Build Coastguard Worker movq [r2 +r7 ], xm1 2977*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm2 2978*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm2 2979*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm3 2980*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm3 2981*c0909341SAndroid Build Coastguard Worker add dstq, 8 2982*c0909341SAndroid Build Coastguard Worker sub wd, 4 2983*c0909341SAndroid Build Coastguard Worker jg .h8_upsample_loop 2984*c0909341SAndroid Build Coastguard Worker RET 2985*c0909341SAndroid Build Coastguard Worker.h8_no_intra_edge_filter: 2986*c0909341SAndroid Build Coastguard Worker and maxbased, 7 2987*c0909341SAndroid Build Coastguard Worker or maxbased, 8 ; imin(w+7, 15) 2988*c0909341SAndroid Build Coastguard Worker jmp .h8_main 2989*c0909341SAndroid Build Coastguard Worker.h8_no_upsample: 2990*c0909341SAndroid Build Coastguard Worker lea maxbased, [wq+7] 2991*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2992*c0909341SAndroid Build Coastguard Worker jnz .h8_no_intra_edge_filter 2993*c0909341SAndroid Build Coastguard Worker call .filter_strength 2994*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2995*c0909341SAndroid Build Coastguard Worker jz .h8_main 2996*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 2997*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2998*c0909341SAndroid Build Coastguard Worker movu m3, 
[tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2999*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] 3000*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 3001*c0909341SAndroid Build Coastguard Worker pmullw m2, m0 3002*c0909341SAndroid Build Coastguard Worker cmp wd, 8 3003*c0909341SAndroid Build Coastguard Worker jl .h8_filter_w4 3004*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm0 3005*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3006*c0909341SAndroid Build Coastguard Worker movd [rsp+28], xm0 3007*c0909341SAndroid Build Coastguard Worker paddw m1, m3 3008*c0909341SAndroid Build Coastguard Worker mov r4d, 16 3009*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 3010*c0909341SAndroid Build Coastguard Worker cmovg maxbased, r4d 3011*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 3012*c0909341SAndroid Build Coastguard Worker jne .h8_filter_3tap 3013*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m3 3014*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3015*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 3016*c0909341SAndroid Build Coastguard Worker movzx r4d, word [tlq-30] 3017*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq-28] 3018*c0909341SAndroid Build Coastguard Worker inc maxbased 3019*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3020*c0909341SAndroid Build Coastguard Worker paddw m0, m3 3021*c0909341SAndroid Build Coastguard Worker sub r2d, r4d 3022*c0909341SAndroid Build Coastguard Worker paddw m2, m0, m0 3023*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r4*8+4] 3024*c0909341SAndroid Build Coastguard Worker shr r2d, 3 3025*c0909341SAndroid Build Coastguard Worker mov [rsp+30], r2w 3026*c0909341SAndroid Build Coastguard Worker jmp .h8_filter_3tap 
3027*c0909341SAndroid Build Coastguard Worker.h8_filter_w4: 3028*c0909341SAndroid Build Coastguard Worker pshufhw xm1, xm0, q2100 3029*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e 3030*c0909341SAndroid Build Coastguard Worker paddw m1, m3 3031*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 3032*c0909341SAndroid Build Coastguard Worker.h8_filter_3tap: 3033*c0909341SAndroid Build Coastguard Worker pxor m0, m0 3034*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3035*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+62] 3036*c0909341SAndroid Build Coastguard Worker psrlw m1, 3 3037*c0909341SAndroid Build Coastguard Worker pavgw m0, m1 3038*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m0 3039*c0909341SAndroid Build Coastguard Worker.h8_main: 3040*c0909341SAndroid Build Coastguard Worker movd xm4, dyd 3041*c0909341SAndroid Build Coastguard Worker neg maxbaseq 3042*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [z_base_inc] 3043*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [tlq+maxbaseq*2] 3044*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 3045*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 3046*c0909341SAndroid Build Coastguard Worker lea r4d, [maxbaseq+7*64] 3047*c0909341SAndroid Build Coastguard Worker neg dyq 3048*c0909341SAndroid Build Coastguard Worker movd xm2, r4d 3049*c0909341SAndroid Build Coastguard Worker sub tlq, 16 3050*c0909341SAndroid Build Coastguard Worker lea r4, [dyq+63] 3051*c0909341SAndroid Build Coastguard Worker paddw m6, m4, m4 3052*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, xm2 3053*c0909341SAndroid Build Coastguard Worker vpblendd m4, m6, 0xf0 ; ypos0 ypos1 3054*c0909341SAndroid Build Coastguard Worker psubw m2, m1 3055*c0909341SAndroid Build Coastguard Worker or maxbased, 63 3056*c0909341SAndroid Build Coastguard Worker paddw m4, m2 3057*c0909341SAndroid Build Coastguard Worker.h8_loop: 
3058*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 3059*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base0 3060*c0909341SAndroid Build Coastguard Worker movu xm0, [tlq+r4*2+2] 3061*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+r4*2] 3062*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 3063*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base1 3064*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [tlq+r5*2+2], 1 3065*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq+r5*2], 1 3066*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 3067*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base2 3068*c0909341SAndroid Build Coastguard Worker pand m3, m5, m4 3069*c0909341SAndroid Build Coastguard Worker psllw m3, 9 3070*c0909341SAndroid Build Coastguard Worker psubw m1, m0 3071*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 3072*c0909341SAndroid Build Coastguard Worker psraw m3, m4, 15 3073*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3074*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3075*c0909341SAndroid Build Coastguard Worker movu xm1, [tlq+r4*2+2] 3076*c0909341SAndroid Build Coastguard Worker movu xm2, [tlq+r4*2] 3077*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 3078*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base3 3079*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m7, m0, m3 3080*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [tlq+r5*2+2], 1 3081*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [tlq+r5*2], 1 3082*c0909341SAndroid Build Coastguard Worker pand m3, m5, m4 3083*c0909341SAndroid Build Coastguard Worker psllw m3, 9 3084*c0909341SAndroid Build Coastguard Worker psubw m2, m1 3085*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 3086*c0909341SAndroid Build Coastguard Worker psraw m3, m4, 15 3087*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3088*c0909341SAndroid Build Coastguard Worker lea r5, 
[dstq+strideq*4] 3089*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3090*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m7, m1, m3 3091*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 3092*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m2, 1 3093*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4 3094*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 3095*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 3096*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m0, 1 3097*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm1 3098*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 3099*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2 3100*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm2 3101*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 3102*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 3103*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*0], xm1 3104*c0909341SAndroid Build Coastguard Worker movq [r5 +strideq*1], xm1 3105*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*2], xm0 3106*c0909341SAndroid Build Coastguard Worker movq [r5 +r7 ], xm0 3107*c0909341SAndroid Build Coastguard Worker sub wd, 4 3108*c0909341SAndroid Build Coastguard Worker jz .h8_end 3109*c0909341SAndroid Build Coastguard Worker add dstq, 8 3110*c0909341SAndroid Build Coastguard Worker cmp r4d, maxbased 3111*c0909341SAndroid Build Coastguard Worker jg .h8_loop 3112*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*5] 3113*c0909341SAndroid Build Coastguard Worker lea r2, [strideq+r7*2] ; stride*7 3114*c0909341SAndroid Build Coastguard Worker test wd, 4 3115*c0909341SAndroid Build
Coastguard Worker jz .h8_end_loop 3116*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm7 3117*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm7 3118*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm7 3119*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm7 3120*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*4], xm7 3121*c0909341SAndroid Build Coastguard Worker movq [dstq+r6 ], xm7 3122*c0909341SAndroid Build Coastguard Worker movq [dstq+r7*2 ], xm7 3123*c0909341SAndroid Build Coastguard Worker movq [dstq+r2 ], xm7 3124*c0909341SAndroid Build Coastguard Worker add dstq, 8 3125*c0909341SAndroid Build Coastguard Worker sub wd, 4 3126*c0909341SAndroid Build Coastguard Worker jz .h8_end 3127*c0909341SAndroid Build Coastguard Worker.h8_end_loop: 3128*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm7 3129*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm7 3130*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm7 3131*c0909341SAndroid Build Coastguard Worker mova [dstq+r7 ], xm7 3132*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*4], xm7 3133*c0909341SAndroid Build Coastguard Worker mova [dstq+r6 ], xm7 3134*c0909341SAndroid Build Coastguard Worker mova [dstq+r7*2 ], xm7 3135*c0909341SAndroid Build Coastguard Worker mova [dstq+r2 ], xm7 3136*c0909341SAndroid Build Coastguard Worker add dstq, 16 3137*c0909341SAndroid Build Coastguard Worker sub wd, 8 3138*c0909341SAndroid Build Coastguard Worker jg .h8_end_loop 3139*c0909341SAndroid Build Coastguard Worker.h8_end: 3140*c0909341SAndroid Build Coastguard Worker RET 3141*c0909341SAndroid Build Coastguard Worker.h16_no_intra_edge_filter: 3142*c0909341SAndroid Build Coastguard Worker and maxbased, 15 3143*c0909341SAndroid Build Coastguard Worker or maxbased, 16 ; imin(w+15, 31) 3144*c0909341SAndroid Build Coastguard Worker jmp .h16_main 3145*c0909341SAndroid Build Coastguard WorkerALIGN 
function_align 3146*c0909341SAndroid Build Coastguard Worker.h16: 3147*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -96, 10 3148*c0909341SAndroid Build Coastguard Worker lea maxbased, [wq+15] 3149*c0909341SAndroid Build Coastguard Worker lea r7, [strideq*3] 3150*c0909341SAndroid Build Coastguard Worker test angled, 0x400 3151*c0909341SAndroid Build Coastguard Worker jnz .h16_no_intra_edge_filter 3152*c0909341SAndroid Build Coastguard Worker call .filter_strength 3153*c0909341SAndroid Build Coastguard Worker test r5d, r5d 3154*c0909341SAndroid Build Coastguard Worker jz .h16_main ; filter_strength == 0 3155*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 3156*c0909341SAndroid Build Coastguard Worker movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3157*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3158*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] 3159*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] 3160*c0909341SAndroid Build Coastguard Worker pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3161*c0909341SAndroid Build Coastguard Worker pmullw m1, m7 3162*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3163*c0909341SAndroid Build Coastguard Worker cmp wd, 8 3164*c0909341SAndroid Build Coastguard Worker jg .h16_filter_w16 3165*c0909341SAndroid Build Coastguard Worker mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 3166*c0909341SAndroid Build Coastguard Worker pmullw xm6, xm3 3167*c0909341SAndroid Build Coastguard Worker jl .h16_filter_w4 3168*c0909341SAndroid Build Coastguard Worker pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3169*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 3170*c0909341SAndroid Build Coastguard Worker jne .h16_filter_w8_3tap 3171*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3172*c0909341SAndroid Build Coastguard 
Worker.h16_filter_w8_5tap: 3173*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m0 3174*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3175*c0909341SAndroid Build Coastguard Worker paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 3176*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3177*c0909341SAndroid Build Coastguard Worker paddw xm4, xm4 3178*c0909341SAndroid Build Coastguard Worker paddw m0, m0 3179*c0909341SAndroid Build Coastguard Worker paddw xm6, xm4 3180*c0909341SAndroid Build Coastguard Worker paddw m1, m0 3181*c0909341SAndroid Build Coastguard Worker.h16_filter_w8_3tap: 3182*c0909341SAndroid Build Coastguard Worker paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 3183*c0909341SAndroid Build Coastguard Worker pmullw xm3, xm7 3184*c0909341SAndroid Build Coastguard Worker pxor m0, m0 3185*c0909341SAndroid Build Coastguard Worker paddw xm3, xm6 3186*c0909341SAndroid Build Coastguard Worker psrlw xm3, 3 3187*c0909341SAndroid Build Coastguard Worker pavgw xm3, xm0 3188*c0909341SAndroid Build Coastguard Worker mova [rsp+48], xm3 3189*c0909341SAndroid Build Coastguard Worker jmp .h16_filter_end 3190*c0909341SAndroid Build Coastguard Worker.h16_filter_w4: 3191*c0909341SAndroid Build Coastguard Worker pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 3192*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 3193*c0909341SAndroid Build Coastguard Worker jne .h16_filter_w8_3tap 3194*c0909341SAndroid Build Coastguard Worker pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 3195*c0909341SAndroid Build Coastguard Worker jmp .h16_filter_w8_5tap 3196*c0909341SAndroid Build Coastguard Worker.h16_filter_w16: 3197*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3198*c0909341SAndroid Build Coastguard Worker pmullw m6, m3 3199*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm3 3200*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, [tlq-64], 0xfe 
; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3201*c0909341SAndroid Build Coastguard Worker paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3202*c0909341SAndroid Build Coastguard Worker mov r4d, 32 3203*c0909341SAndroid Build Coastguard Worker cmp wd, 16 3204*c0909341SAndroid Build Coastguard Worker cmovg maxbased, r4d 3205*c0909341SAndroid Build Coastguard Worker movd [rsp+28], xm3 3206*c0909341SAndroid Build Coastguard Worker pmullw m4, m7 3207*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 3208*c0909341SAndroid Build Coastguard Worker jne .h16_filter_w16_3tap 3209*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m0 3210*c0909341SAndroid Build Coastguard Worker vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3211*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3212*c0909341SAndroid Build Coastguard Worker paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3213*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3214*c0909341SAndroid Build Coastguard Worker movzx r4d, word [tlq-62] 3215*c0909341SAndroid Build Coastguard Worker movzx r2d, word [tlq-60] 3216*c0909341SAndroid Build Coastguard Worker or maxbased, 1 3217*c0909341SAndroid Build Coastguard Worker paddw m3, m3 3218*c0909341SAndroid Build Coastguard Worker sub r2d, r4d 3219*c0909341SAndroid Build Coastguard Worker paddw m0, m0 3220*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r4*8+4] 3221*c0909341SAndroid Build Coastguard Worker paddw m4, m3 3222*c0909341SAndroid Build Coastguard Worker shr r2d, 3 3223*c0909341SAndroid Build Coastguard Worker paddw m1, m0 3224*c0909341SAndroid Build Coastguard Worker mov [rsp+30], r2w 3225*c0909341SAndroid Build Coastguard Worker.h16_filter_w16_3tap: 3226*c0909341SAndroid Build Coastguard Worker pxor m0, m0 3227*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3228*c0909341SAndroid Build Coastguard Worker psrlw m4, 3 3229*c0909341SAndroid Build 
Coastguard Worker pavgw m4, m0 3230*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m4 3231*c0909341SAndroid Build Coastguard Worker.h16_filter_end: 3232*c0909341SAndroid Build Coastguard Worker psrlw m1, 3 3233*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+94] 3234*c0909341SAndroid Build Coastguard Worker pavgw m1, m0 3235*c0909341SAndroid Build Coastguard Worker mova [rsp+64], m1 3236*c0909341SAndroid Build Coastguard Worker.h16_main: 3237*c0909341SAndroid Build Coastguard Worker movd xm8, dyd 3238*c0909341SAndroid Build Coastguard Worker neg maxbaseq 3239*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [tlq+maxbaseq*2] 3240*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 3241*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, xm8 3242*c0909341SAndroid Build Coastguard Worker lea r4d, [maxbaseq+dyq+15*64] 3243*c0909341SAndroid Build Coastguard Worker neg dyq 3244*c0909341SAndroid Build Coastguard Worker movd xm7, r4d 3245*c0909341SAndroid Build Coastguard Worker sub tlq, 32 3246*c0909341SAndroid Build Coastguard Worker lea r4, [dyq+63] 3247*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, xm7 3248*c0909341SAndroid Build Coastguard Worker or maxbased, 63 3249*c0909341SAndroid Build Coastguard Worker psubw m7, [z_base_inc] 3250*c0909341SAndroid Build Coastguard Worker.h16_loop: 3251*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 3252*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base0 3253*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2+2] 3254*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r4*2] 3255*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 3256*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base1 3257*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2+2] 3258*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r5*2] 3259*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 3260*c0909341SAndroid Build Coastguard Worker sar r4, 6 ; base2
3261*c0909341SAndroid Build Coastguard Worker pand m6, m5, m7 3262*c0909341SAndroid Build Coastguard Worker psllw m6, 9 3263*c0909341SAndroid Build Coastguard Worker psubw m2, m0 3264*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 3265*c0909341SAndroid Build Coastguard Worker psraw m6, m7, 15 3266*c0909341SAndroid Build Coastguard Worker paddw m7, m8 3267*c0909341SAndroid Build Coastguard Worker paddw m0, m2 3268*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r4*2+2] 3269*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+r4*2] 3270*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 3271*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base3 3272*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m9, m0, m6 3273*c0909341SAndroid Build Coastguard Worker pand m6, m5, m7 3274*c0909341SAndroid Build Coastguard Worker psllw m6, 9 3275*c0909341SAndroid Build Coastguard Worker psubw m3, m1 3276*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 3277*c0909341SAndroid Build Coastguard Worker psraw m6, m7, 15 3278*c0909341SAndroid Build Coastguard Worker paddw m7, m8 3279*c0909341SAndroid Build Coastguard Worker paddw m1, m3 3280*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m9, m1, m6 3281*c0909341SAndroid Build Coastguard Worker pand m6, m5, m7 3282*c0909341SAndroid Build Coastguard Worker psllw m6, 9 3283*c0909341SAndroid Build Coastguard Worker psubw m4, m2 3284*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m6 3285*c0909341SAndroid Build Coastguard Worker psraw m6, m7, 15 3286*c0909341SAndroid Build Coastguard Worker paddw m7, m8 3287*c0909341SAndroid Build Coastguard Worker paddw m2, m4 3288*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r5*2+2] 3289*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+r5*2] 3290*c0909341SAndroid Build Coastguard Worker vpblendvb m2, m9, m2, m6 3291*c0909341SAndroid Build Coastguard Worker pand m6, m5, m7 3292*c0909341SAndroid Build Coastguard Worker psllw m6, 9 
3293*c0909341SAndroid Build Coastguard Worker psubw m4, m3 3294*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m6 3295*c0909341SAndroid Build Coastguard Worker psraw m6, m7, 15 3296*c0909341SAndroid Build Coastguard Worker paddw m7, m8 3297*c0909341SAndroid Build Coastguard Worker lea r5, [dstq+strideq*4] 3298*c0909341SAndroid Build Coastguard Worker paddw m3, m4 3299*c0909341SAndroid Build Coastguard Worker vpblendvb m3, m9, m3, m6 3300*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 3301*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 3302*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 3303*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 3304*c0909341SAndroid Build Coastguard Worker punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 3305*c0909341SAndroid Build Coastguard Worker vextracti128 xm6, m3, 1 3306*c0909341SAndroid Build Coastguard Worker punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 3307*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 3308*c0909341SAndroid Build Coastguard Worker punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 3309*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m4, 1 3310*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm6 3311*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm6 3312*c0909341SAndroid Build Coastguard Worker vextracti128 xm6, m1, 1 3313*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2 3314*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm2 3315*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m0, 1 3316*c0909341SAndroid Build Coastguard Worker movhps [r5 
+strideq*0], xm6 3317*c0909341SAndroid Build Coastguard Worker movq [r5 +strideq*1], xm6 3318*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*2], xm2 3319*c0909341SAndroid Build Coastguard Worker movq [r5 +r7 ], xm2 3320*c0909341SAndroid Build Coastguard Worker lea r5, [dstq+strideq*8] 3321*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*0], xm3 3322*c0909341SAndroid Build Coastguard Worker movq [r5 +strideq*1], xm3 3323*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*2], xm4 3324*c0909341SAndroid Build Coastguard Worker movq [r5 +r7 ], xm4 3325*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*4] 3326*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*0], xm1 3327*c0909341SAndroid Build Coastguard Worker movq [r5 +strideq*1], xm1 3328*c0909341SAndroid Build Coastguard Worker movhps [r5 +strideq*2], xm0 3329*c0909341SAndroid Build Coastguard Worker movq [r5 +r7 ], xm0 3330*c0909341SAndroid Build Coastguard Worker sub wd, 4 3331*c0909341SAndroid Build Coastguard Worker jz .h16_end 3332*c0909341SAndroid Build Coastguard Worker add dstq, 8 3333*c0909341SAndroid Build Coastguard Worker cmp r4d, maxbased 3334*c0909341SAndroid Build Coastguard Worker jg .h16_loop 3335*c0909341SAndroid Build Coastguard Worker mov hd, 4 3336*c0909341SAndroid Build Coastguard Worker.h16_end_loop0: 3337*c0909341SAndroid Build Coastguard Worker mov r6d, wd 3338*c0909341SAndroid Build Coastguard Worker mov r2, dstq 3339*c0909341SAndroid Build Coastguard Worker test wb, 4 3340*c0909341SAndroid Build Coastguard Worker jz .h16_end_loop 3341*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm9 3342*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm9 3343*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm9 3344*c0909341SAndroid Build Coastguard Worker movq [dstq+r7 ], xm9 3345*c0909341SAndroid Build Coastguard Worker and r6d, 120 3346*c0909341SAndroid Build Coastguard Worker jz .h16_end_w4 
3347*c0909341SAndroid Build Coastguard Worker add dstq, 8 3348*c0909341SAndroid Build Coastguard Worker.h16_end_loop: 3349*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm9 3350*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm9 3351*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm9 3352*c0909341SAndroid Build Coastguard Worker mova [dstq+r7 ], xm9 3353*c0909341SAndroid Build Coastguard Worker add dstq, 16 3354*c0909341SAndroid Build Coastguard Worker sub r6d, 8 3355*c0909341SAndroid Build Coastguard Worker jg .h16_end_loop 3356*c0909341SAndroid Build Coastguard Worker.h16_end_w4: 3357*c0909341SAndroid Build Coastguard Worker lea dstq, [r2+strideq*4] 3358*c0909341SAndroid Build Coastguard Worker dec hd 3359*c0909341SAndroid Build Coastguard Worker jg .h16_end_loop0 3360*c0909341SAndroid Build Coastguard Worker.h16_end: 3361*c0909341SAndroid Build Coastguard Worker RET 3362*c0909341SAndroid Build Coastguard Worker.h32: 3363*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -160, 9 3364*c0909341SAndroid Build Coastguard Worker lea maxbased, [wq+31] 3365*c0909341SAndroid Build Coastguard Worker and maxbased, 31 3366*c0909341SAndroid Build Coastguard Worker or maxbased, 32 ; imin(w+31, 63) 3367*c0909341SAndroid Build Coastguard Worker test angled, 0x400 3368*c0909341SAndroid Build Coastguard Worker jnz .h32_main 3369*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pw_3] 3370*c0909341SAndroid Build Coastguard Worker movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3371*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m0 3372*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3373*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3374*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3375*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 
3376*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3377*c0909341SAndroid Build Coastguard Worker lea r4, [rsp+128] 3378*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3379*c0909341SAndroid Build Coastguard Worker lea r5d, [maxbaseq-31] 3380*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3381*c0909341SAndroid Build Coastguard Worker mova [r4], m0 3382*c0909341SAndroid Build Coastguard Worker.h32_filter_loop: 3383*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-62] 3384*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [tlq-66] 3385*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-64] 3386*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq-58] 3387*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-60] 3388*c0909341SAndroid Build Coastguard Worker sub tlq, 32 3389*c0909341SAndroid Build Coastguard Worker sub r4, 32 3390*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3391*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3392*c0909341SAndroid Build Coastguard Worker mova [r4], m0 3393*c0909341SAndroid Build Coastguard Worker sub r5d, 16 3394*c0909341SAndroid Build Coastguard Worker jg .h32_filter_loop 3395*c0909341SAndroid Build Coastguard Worker jl .h32_filter_h8 3396*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3397*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm0, xm0 3398*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3399*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3400*c0909341SAndroid Build Coastguard Worker vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3401*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3402*c0909341SAndroid Build Coastguard Worker movzx r5d, word [tlq-62] 3403*c0909341SAndroid Build Coastguard Worker movzx r2d, word 
[tlq-60] 3404*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 3405*c0909341SAndroid Build Coastguard Worker sub r2d, r5d 3406*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3407*c0909341SAndroid Build Coastguard Worker lea r2d, [r2+r5*8+4] 3408*c0909341SAndroid Build Coastguard Worker paddw m0, m2 3409*c0909341SAndroid Build Coastguard Worker shr r2d, 3 3410*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3411*c0909341SAndroid Build Coastguard Worker mova [r4-32], m0 3412*c0909341SAndroid Build Coastguard Worker mov [r4-36], r5w 3413*c0909341SAndroid Build Coastguard Worker mov [r4-34], r2w 3414*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+158] 3415*c0909341SAndroid Build Coastguard Worker mov r4d, 65 3416*c0909341SAndroid Build Coastguard Worker cmp wd, 64 3417*c0909341SAndroid Build Coastguard Worker cmove maxbased, r4d 3418*c0909341SAndroid Build Coastguard Worker jmp .h32_main 3419*c0909341SAndroid Build Coastguard Worker.h32_filter_h8: 3420*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 3421*c0909341SAndroid Build Coastguard Worker pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3422*c0909341SAndroid Build Coastguard Worker paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 3423*c0909341SAndroid Build Coastguard Worker paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 3424*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3425*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+158] 3426*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm3 3427*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 3428*c0909341SAndroid Build Coastguard Worker paddw xm0, xm2 3429*c0909341SAndroid Build Coastguard Worker psrlw xm0, 2 3430*c0909341SAndroid Build Coastguard Worker mova [r4-16], xm0 3431*c0909341SAndroid Build Coastguard Worker.h32_main: 3432*c0909341SAndroid Build Coastguard Worker movd xm6, dyd 3433*c0909341SAndroid Build Coastguard Worker neg maxbaseq 3434*c0909341SAndroid 
Build Coastguard Worker vpbroadcastw m7, [tlq+maxbaseq*2] 3435*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 3436*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 3437*c0909341SAndroid Build Coastguard Worker lea r4d, [maxbaseq+dyq+15*64] 3438*c0909341SAndroid Build Coastguard Worker neg dyq 3439*c0909341SAndroid Build Coastguard Worker movd xm4, r4d 3440*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_m1024] 3441*c0909341SAndroid Build Coastguard Worker lea r4, [dyq+63] 3442*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 3443*c0909341SAndroid Build Coastguard Worker or maxbased, 63 3444*c0909341SAndroid Build Coastguard Worker psubw m4, [z_base_inc] 3445*c0909341SAndroid Build Coastguard Worker.h32_loop: 3446*c0909341SAndroid Build Coastguard Worker mov r5, r4 3447*c0909341SAndroid Build Coastguard Worker sar r5, 6 3448*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-64] 3449*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2-62] 3450*c0909341SAndroid Build Coastguard Worker pand m3, m5, m4 3451*c0909341SAndroid Build Coastguard Worker psllw m3, 9 3452*c0909341SAndroid Build Coastguard Worker psubw m1, m0 3453*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 3454*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m8, m4 3455*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3456*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m7, m0, m2 3457*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r5*2-32] 3458*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-30] 3459*c0909341SAndroid Build Coastguard Worker add r4, dyq 3460*c0909341SAndroid Build Coastguard Worker sub rsp, 64 3461*c0909341SAndroid Build Coastguard Worker psubw m2, m1 3462*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 3463*c0909341SAndroid Build Coastguard Worker psraw m3, m4, 15 3464*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3465*c0909341SAndroid Build Coastguard Worker 
mova [rsp+32*0], m0 3466*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3467*c0909341SAndroid Build Coastguard Worker vpblendvb m1, m7, m1, m3 3468*c0909341SAndroid Build Coastguard Worker mova [rsp+32*1], m1 3469*c0909341SAndroid Build Coastguard Worker dec wd 3470*c0909341SAndroid Build Coastguard Worker jz .h32_transpose 3471*c0909341SAndroid Build Coastguard Worker cmp r4d, maxbased 3472*c0909341SAndroid Build Coastguard Worker jg .h32_loop 3473*c0909341SAndroid Build Coastguard Worker.h32_end_loop: 3474*c0909341SAndroid Build Coastguard Worker sub rsp, 64 3475*c0909341SAndroid Build Coastguard Worker mova [rsp+32*0], m7 3476*c0909341SAndroid Build Coastguard Worker mova [rsp+32*1], m7 3477*c0909341SAndroid Build Coastguard Worker dec wd 3478*c0909341SAndroid Build Coastguard Worker jg .h32_end_loop 3479*c0909341SAndroid Build Coastguard Worker.h32_transpose: 3480*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 3481*c0909341SAndroid Build Coastguard Worker lea r4, [strideq*5] 3482*c0909341SAndroid Build Coastguard Worker mov r8, dstq 3483*c0909341SAndroid Build Coastguard Worker lea r5, [strideq+r3*2] 3484*c0909341SAndroid Build Coastguard Worker.h32_transpose_loop0: 3485*c0909341SAndroid Build Coastguard Worker lea r6, [rsp+32] 3486*c0909341SAndroid Build Coastguard Worker lea r2, [r8+org_wq*2-16] 3487*c0909341SAndroid Build Coastguard Worker.h32_transpose_loop: 3488*c0909341SAndroid Build Coastguard Worker mova m0, [r6+64*7] 3489*c0909341SAndroid Build Coastguard Worker mova m1, [r6+64*6] 3490*c0909341SAndroid Build Coastguard Worker mova m2, [r6+64*5] 3491*c0909341SAndroid Build Coastguard Worker mova m3, [r6+64*4] 3492*c0909341SAndroid Build Coastguard Worker mova m4, [r6+64*3] 3493*c0909341SAndroid Build Coastguard Worker mova m5, [r6+64*2] 3494*c0909341SAndroid Build Coastguard Worker mova m6, [r6+64*1] 3495*c0909341SAndroid Build Coastguard Worker mova m7, [r6+64*0] 3496*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m0, m1 ; 
a3 b3 a2 b2 a1 b1 a0 b0 3497*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 3498*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 3499*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 3500*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 3501*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 3502*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 3503*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 3504*c0909341SAndroid Build Coastguard Worker lea dstq, [r2+strideq*8] 3505*c0909341SAndroid Build Coastguard Worker sub r6, 32 3506*c0909341SAndroid Build Coastguard Worker punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 3507*c0909341SAndroid Build Coastguard Worker punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 3508*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 3509*c0909341SAndroid Build Coastguard Worker punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 3510*c0909341SAndroid Build Coastguard Worker punpckhqdq m5, m7, m1 ; 8 0 3511*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +strideq*0], m5, 1 3512*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m1 ; 9 1 3513*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm5 3514*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m8, m3 ; 10 2 3515*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +strideq*1], m7, 1 3516*c0909341SAndroid Build Coastguard Worker punpcklqdq m8, m3 ; 11 3 3517*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm7 3518*c0909341SAndroid Build Coastguard Worker punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 3519*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +strideq*2], m1, 1 3520*c0909341SAndroid Build Coastguard Worker punpckldq m0, m2 ; a7 b7 c7 
d7 a6 b6 c6 d6 3521*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 3522*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 3523*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +r3 ], m8, 1 3524*c0909341SAndroid Build Coastguard Worker punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 3525*c0909341SAndroid Build Coastguard Worker mova [dstq+r3 ], xm8 3526*c0909341SAndroid Build Coastguard Worker punpckhqdq m6, m3, m2 ; 12 4 3527*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +strideq*4], m6, 1 3528*c0909341SAndroid Build Coastguard Worker punpcklqdq m3, m2 ; 13 5 3529*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*4], xm6 3530*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m0, m4 ; 14 6 3531*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +r4 ], m3, 1 3532*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m4 ; 15 7 3533*c0909341SAndroid Build Coastguard Worker mova [dstq+r4 ], xm3 3534*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +r3*2 ], m2, 1 3535*c0909341SAndroid Build Coastguard Worker mova [dstq+r3*2 ], xm2 3536*c0909341SAndroid Build Coastguard Worker vextracti128 [r2 +r5 ], m0, 1 3537*c0909341SAndroid Build Coastguard Worker mova [dstq+r5 ], xm0 3538*c0909341SAndroid Build Coastguard Worker lea r2, [dstq+strideq*8] 3539*c0909341SAndroid Build Coastguard Worker cmp r6, rsp 3540*c0909341SAndroid Build Coastguard Worker jae .h32_transpose_loop 3541*c0909341SAndroid Build Coastguard Worker add rsp, 64*8 3542*c0909341SAndroid Build Coastguard Worker sub org_wd, 8 3543*c0909341SAndroid Build Coastguard Worker jg .h32_transpose_loop0 3544*c0909341SAndroid Build Coastguard Worker.h32_end: 3545*c0909341SAndroid Build Coastguard Worker RET 3546*c0909341SAndroid Build Coastguard Worker.h64: 3547*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -256, 10 3548*c0909341SAndroid Build Coastguard Worker lea maxbased, [wq+63] 3549*c0909341SAndroid 
Build Coastguard Worker test angled, 0x400 3550*c0909341SAndroid Build Coastguard Worker jnz .h64_main 3551*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pw_3] 3552*c0909341SAndroid Build Coastguard Worker movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3553*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m0 3554*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3555*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3556*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3557*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3558*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3559*c0909341SAndroid Build Coastguard Worker lea r4, [rsp+224] 3560*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3561*c0909341SAndroid Build Coastguard Worker lea r5d, [wq+32] 3562*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3563*c0909341SAndroid Build Coastguard Worker mova [r4], m0 3564*c0909341SAndroid Build Coastguard Worker.h64_filter_loop: 3565*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-62] 3566*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [tlq-66] 3567*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-64] 3568*c0909341SAndroid Build Coastguard Worker pavgw m1, [tlq-58] 3569*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-60] 3570*c0909341SAndroid Build Coastguard Worker sub tlq, 32 3571*c0909341SAndroid Build Coastguard Worker sub r4, 32 3572*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3573*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3574*c0909341SAndroid Build Coastguard Worker mova [r4], m0 3575*c0909341SAndroid Build Coastguard Worker sub r5d, 16 3576*c0909341SAndroid Build Coastguard Worker jg .h64_filter_loop 3577*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 
a b c d e f 3578*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm0, xm0 3579*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3580*c0909341SAndroid Build Coastguard Worker paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3581*c0909341SAndroid Build Coastguard Worker vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3582*c0909341SAndroid Build Coastguard Worker vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3583*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+254] 3584*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 3585*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3586*c0909341SAndroid Build Coastguard Worker paddw m0, m2 3587*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3588*c0909341SAndroid Build Coastguard Worker mova [r4-32], m0 3589*c0909341SAndroid Build Coastguard Worker.h64_main: 3590*c0909341SAndroid Build Coastguard Worker neg maxbaseq 3591*c0909341SAndroid Build Coastguard Worker movd xm4, dyd 3592*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [tlq+maxbaseq*2] 3593*c0909341SAndroid Build Coastguard Worker shl maxbased, 6 3594*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 3595*c0909341SAndroid Build Coastguard Worker lea r4d, [maxbaseq+dyq+15*64] 3596*c0909341SAndroid Build Coastguard Worker neg dyq 3597*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_m1024] 3598*c0909341SAndroid Build Coastguard Worker movd xm3, r4d 3599*c0909341SAndroid Build Coastguard Worker lea r4, [dyq+63] 3600*c0909341SAndroid Build Coastguard Worker paddw m8, m7, m7 3601*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 3602*c0909341SAndroid Build Coastguard Worker or maxbased, 63 3603*c0909341SAndroid Build Coastguard Worker paddw m9, m8, m7 3604*c0909341SAndroid Build Coastguard Worker psubw m3, [z_base_inc] 3605*c0909341SAndroid Build Coastguard Worker.h64_loop: 3606*c0909341SAndroid Build Coastguard Worker 
mov r5, r4 3607*c0909341SAndroid Build Coastguard Worker sar r5, 6 3608*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-128] 3609*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2-126] 3610*c0909341SAndroid Build Coastguard Worker pand m2, m5, m3 3611*c0909341SAndroid Build Coastguard Worker psllw m2, 9 3612*c0909341SAndroid Build Coastguard Worker psubw m1, m0 3613*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 3614*c0909341SAndroid Build Coastguard Worker sub rsp, 128 3615*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3616*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m9, m3 3617*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 3618*c0909341SAndroid Build Coastguard Worker mova [rsp+32*0], m0 3619*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-96] 3620*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2-94] 3621*c0909341SAndroid Build Coastguard Worker psubw m1, m0 3622*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 3623*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3624*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m8, m3 3625*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 3626*c0909341SAndroid Build Coastguard Worker mova [rsp+32*1], m0 3627*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-64] 3628*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2-62] 3629*c0909341SAndroid Build Coastguard Worker psubw m1, m0 3630*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 3631*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3632*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m7, m3 3633*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m1 3634*c0909341SAndroid Build Coastguard Worker mova [rsp+32*2], m0 3635*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-32] 3636*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r5*2-30] 3637*c0909341SAndroid Build Coastguard Worker psubw 
m1, m0 3638*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 3639*c0909341SAndroid Build Coastguard Worker add r4, dyq 3640*c0909341SAndroid Build Coastguard Worker psraw m2, m3, 15 3641*c0909341SAndroid Build Coastguard Worker paddw m3, m4 3642*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3643*c0909341SAndroid Build Coastguard Worker vpblendvb m0, m6, m0, m2 3644*c0909341SAndroid Build Coastguard Worker mova [rsp+32*3], m0 3645*c0909341SAndroid Build Coastguard Worker dec wd 3646*c0909341SAndroid Build Coastguard Worker jz .h64_transpose 3647*c0909341SAndroid Build Coastguard Worker cmp r4d, maxbased 3648*c0909341SAndroid Build Coastguard Worker jg .h64_loop 3649*c0909341SAndroid Build Coastguard Worker.h64_end_loop: 3650*c0909341SAndroid Build Coastguard Worker sub rsp, 128 3651*c0909341SAndroid Build Coastguard Worker mova [rsp+32*0], m6 3652*c0909341SAndroid Build Coastguard Worker mova [rsp+32*1], m6 3653*c0909341SAndroid Build Coastguard Worker mova [rsp+32*2], m6 3654*c0909341SAndroid Build Coastguard Worker mova [rsp+32*3], m6 3655*c0909341SAndroid Build Coastguard Worker dec wd 3656*c0909341SAndroid Build Coastguard Worker jg .h64_end_loop 3657*c0909341SAndroid Build Coastguard Worker.h64_transpose: 3658*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 3659*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*5] 3660*c0909341SAndroid Build Coastguard Worker mov r5, dstq 3661*c0909341SAndroid Build Coastguard Worker lea r4, [strideq+r2*2] 3662*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop0: 3663*c0909341SAndroid Build Coastguard Worker lea r6, [rsp+112] 3664*c0909341SAndroid Build Coastguard Worker lea dstq, [r5+org_wq*2-32] 3665*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop: 3666*c0909341SAndroid Build Coastguard Worker mova xm0, [r6+128*15] 3667*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r6+128* 7], 1 3668*c0909341SAndroid Build Coastguard Worker mova xm1, [r6+128*14] 
3669*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [r6+128* 6], 1 3670*c0909341SAndroid Build Coastguard Worker mova xm2, [r6+128*13] 3671*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [r6+128* 5], 1 3672*c0909341SAndroid Build Coastguard Worker mova xm3, [r6+128*12] 3673*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [r6+128* 4], 1 3674*c0909341SAndroid Build Coastguard Worker mova xm4, [r6+128*11] 3675*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [r6+128* 3], 1 3676*c0909341SAndroid Build Coastguard Worker mova xm5, [r6+128*10] 3677*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r6+128* 2], 1 3678*c0909341SAndroid Build Coastguard Worker mova xm6, [r6+128* 9] 3679*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r6+128* 1], 1 3680*c0909341SAndroid Build Coastguard Worker mova xm7, [r6+128* 8] 3681*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [r6+128* 0], 1 3682*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m0, m1 3683*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 3684*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 3685*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 3686*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m5 3687*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 3688*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m7 3689*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 3690*c0909341SAndroid Build Coastguard Worker sub r6, 16 3691*c0909341SAndroid Build Coastguard Worker punpckhdq m7, m8, m1 3692*c0909341SAndroid Build Coastguard Worker punpckldq m8, m1 3693*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m3, m5 3694*c0909341SAndroid Build Coastguard Worker punpckldq m3, m5 3695*c0909341SAndroid Build Coastguard Worker punpckhqdq m5, m7, m1 3696*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m1 3697*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m8, m3 
; (remaining statements of the .h64_transpose_loop body that begins above)
    punpcklqdq           m8, m3
    punpckhdq            m3, m0, m2
    mova   [dstq+strideq*0], m5
    punpckldq            m0, m2
    mova   [dstq+strideq*1], m7
    punpckhdq            m2, m4, m6
    mova   [dstq+strideq*2], m1
    punpckldq            m4, m6
    mova   [dstq+r2       ], m8
    punpckhqdq           m6, m3, m2
    mova   [dstq+strideq*4], m6
    punpcklqdq           m3, m2
    mova   [dstq+r3       ], m3
    punpckhqdq           m2, m0, m4
    mova   [dstq+r2*2     ], m2
    punpcklqdq           m0, m4
    mova   [dstq+r4       ], m0
    lea                dstq, [dstq+strideq*8]
    cmp                  r6, rsp
    jae .h64_transpose_loop
    add                 rsp, 128*16
    sub              org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

; FILTER_1BLK dst, src, tmp, shuf, bdmax
;
; Applies the currently loaded filter taps to one block of input pixels.
;   %1 dst   - register number; receives the clamped 16-bit results. Because the
;              final pack uses dst for both sources, every 128-bit lane holds its
;              four result words twice (low qword == high qword).
;   %2 src   - register number holding the input pixels in its low 128 bits.
;              Clobbered: it is shuffled in place and its low lane is mirrored
;              into the high lane.
;   %3 tmp   - scratch register number.
;   %4 shuf  - pshufb control. If numeric it is taken as a register number
;              (xm%4); anything else (e.g. a memory operand) is used verbatim.
;   %5 bdmax - register number holding the upper clamp value (the callers in
;              this file broadcast bitdepth_max into it).
; Implicit inputs: m2..m5 are the pmaddwd multipliers for source dwords 0..3 and
; m1 is added to every dword sum before the shift. ipred_filter_16bpc loads
; m2..m5 with sign-extended filter taps and m1 with pd_8 (presumably the
; rounding bias for the >>4 - confirm against the constant's definition).
; qXXXX are shuffle immediates defined outside this file section (x86inc).
%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
%ifnum %4
    pshufb             xm%2, xm%4
%else
    pshufb             xm%2, %4
%endif
    vinserti128         m%2, xm%2, 1       ; same shuffled pixels in both lanes
    pshufd              m%1, m%2, q0000    ; broadcast word pair 0 within each lane
    pmaddwd             m%1, m2
    pshufd              m%3, m%2, q1111    ; word pair 1
    pmaddwd             m%3, m3
    paddd               m%1, m1            ; add the m1 bias once
    paddd               m%1, m%3
    pshufd              m%3, m%2, q2222    ; word pair 2
    pmaddwd             m%3, m4
    paddd               m%1, m%3
    pshufd              m%3, m%2, q3333    ; word pair 3
    pmaddwd             m%3, m5
    paddd               m%1, m%3
    psrad               m%1, 4
    packusdw            m%1, m%1           ; dwords -> words, unsigned saturation
    pminsw              m%1, m%5           ; clamp to bdmax
%endmacro

; FILTER_2BLK dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
;
; Same arithmetic as FILTER_1BLK, applied to two blocks at once: one block's
; pixels come from the low 128-bit lane of src and the other's from the high
; lane.
;   %1 dst     - register number; after the final pack each lane holds the
;                low-lane block's results in its low qword and the high-lane
;                block's results in its high qword.
;   %2 src     - register number holding both blocks' pixels. Clobbered: it is
;                shuffled in place, then its low lane is mirrored into the high
;                lane.
;   %3 tmp_dst - scratch register number; accumulator for the high-lane block.
;   %4 tmp_src - scratch register number; gets the shuffled high lane of src in
;                both lanes.
;   %5 tmp     - scratch register number.
;   %6 shuf    - register number of a 256-bit pshufb control (one 128-bit
;                pattern per lane).
;   %7 bdmax   - register number holding the upper clamp value.
; Implicit inputs: m1 (bias) and m2..m5 (multipliers), as for FILTER_1BLK.
%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
    pshufb              m%2, m%6
    vpermq              m%4, m%2, q3232    ; high-lane block -> both lanes
    vinserti128         m%2, xm%2, 1       ; low-lane block  -> both lanes
    pshufd              m%1, m%2, q0000
    pshufd              m%3, m%4, q0000
    pmaddwd             m%1, m2
    pmaddwd             m%3, m2
    paddd               m%1, m1            ; add the m1 bias to both accumulators
    paddd               m%3, m1
    pshufd              m%5, m%2, q1111
    pmaddwd             m%5, m3
    paddd               m%1, m%5
    pshufd              m%5, m%4, q1111
    pmaddwd             m%5, m3
    paddd               m%3, m%5
    pshufd              m%5, m%2, q2222
    pmaddwd             m%5, m4
    paddd               m%1, m%5
    pshufd              m%5, m%4, q2222
    pmaddwd             m%5, m4
    paddd               m%3, m%5
    pshufd              m%5, m%2, q3333
    pmaddwd             m%5, m5
    paddd               m%1, m%5
    pshufd              m%5, m%4, q3333
    pmaddwd             m%5, m5
    paddd               m%3, m%5
    psrad               m%1, 4
    psrad               m%3, 4
    packusdw            m%1, m%3           ; interleave both blocks per lane
    pminsw              m%1, m%7           ; clamp to bdmax
%endmacro

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row. One redundant
; block is calculated for w8 and w16, two for w32.
3785*c0909341SAndroid Build Coastguard Worker; w4 w8 w16 w32 3786*c0909341SAndroid Build Coastguard Worker; 1 1 2 1 2 3 5 1 2 3 5 b c d f 3787*c0909341SAndroid Build Coastguard Worker; 2 2 3 2 4 5 7 2 4 5 7 c e f h 3788*c0909341SAndroid Build Coastguard Worker; 3 3 4 4 6 7 9 4 6 7 9 e g h j 3789*c0909341SAndroid Build Coastguard Worker; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ 3790*c0909341SAndroid Build Coastguard Worker; 5 8 8 i 3791*c0909341SAndroid Build Coastguard Worker 3792*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter 3793*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_filter_16bpc_avx2_table 3794*c0909341SAndroid Build Coastguard Worker lea r6, [filter_intra_taps] 3795*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3796*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm 3797*c0909341SAndroid Build Coastguard Worker movzx filterd, filterb 3798*c0909341SAndroid Build Coastguard Worker%else 3799*c0909341SAndroid Build Coastguard Worker movzx filterd, byte filterm 3800*c0909341SAndroid Build Coastguard Worker%endif 3801*c0909341SAndroid Build Coastguard Worker shl filterd, 6 3802*c0909341SAndroid Build Coastguard Worker add filterq, r6 3803*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_filter_16bpc_avx2_table] 3804*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [tlq-6] 3805*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 3806*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+pd_8] 3807*c0909341SAndroid Build Coastguard Worker pmovsxbw m2, [filterq+16*0] 3808*c0909341SAndroid Build Coastguard Worker pmovsxbw m3, [filterq+16*1] 3809*c0909341SAndroid Build Coastguard Worker pmovsxbw m4, [filterq+16*2] 3810*c0909341SAndroid Build Coastguard Worker pmovsxbw m5, [filterq+16*3] 3811*c0909341SAndroid Build Coastguard Worker add wq, r6 3812*c0909341SAndroid Build Coastguard Worker mov hd, hm 3813*c0909341SAndroid 
Build Coastguard Worker jmp wq 3814*c0909341SAndroid Build Coastguard Worker.w4: 3815*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 10 3816*c0909341SAndroid Build Coastguard Worker mova xm8, [base+filter_shuf2] 3817*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, r8m ; bitdepth_max 3818*c0909341SAndroid Build Coastguard Worker lea r7, [6+hq*2] 3819*c0909341SAndroid Build Coastguard Worker sub tlq, r7 3820*c0909341SAndroid Build Coastguard Worker jmp .w4_loop_start 3821*c0909341SAndroid Build Coastguard Worker.w4_loop: 3822*c0909341SAndroid Build Coastguard Worker pinsrq xm0, [tlq+hq*2], 0 3823*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3824*c0909341SAndroid Build Coastguard Worker.w4_loop_start: 3825*c0909341SAndroid Build Coastguard Worker FILTER_1BLK 6, 0, 7, 8, 9 3826*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m6, 1 3827*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm6 3828*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm0 3829*c0909341SAndroid Build Coastguard Worker sub hd, 2 3830*c0909341SAndroid Build Coastguard Worker jg .w4_loop 3831*c0909341SAndroid Build Coastguard Worker RET 3832*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3833*c0909341SAndroid Build Coastguard Worker.w8: 3834*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 16 3835*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m14, [base+filter_shuf3] 3836*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, r8m ; bitdepth_max 3837*c0909341SAndroid Build Coastguard Worker FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 3838*c0909341SAndroid Build Coastguard Worker vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 3839*c0909341SAndroid Build Coastguard Worker pslldq m8, m0, 4 3840*c0909341SAndroid Build Coastguard Worker psrldq m7, m6, 2 3841*c0909341SAndroid Build Coastguard Worker psrldq m0, m6, 10 3842*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m0 
3843*c0909341SAndroid Build Coastguard Worker vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 3844*c0909341SAndroid Build Coastguard Worker vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 3845*c0909341SAndroid Build Coastguard Worker vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 3846*c0909341SAndroid Build Coastguard Worker lea r7, [16+hq*2] 3847*c0909341SAndroid Build Coastguard Worker sub tlq, r7 3848*c0909341SAndroid Build Coastguard Worker jmp .w8_loop_start 3849*c0909341SAndroid Build Coastguard Worker.w8_loop: 3850*c0909341SAndroid Build Coastguard Worker vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 3851*c0909341SAndroid Build Coastguard Worker vpermq m6, m9, q2031 3852*c0909341SAndroid Build Coastguard Worker psrldq m0, m6, 2 3853*c0909341SAndroid Build Coastguard Worker psrldq m6, 10 3854*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m0 3855*c0909341SAndroid Build Coastguard Worker vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 3856*c0909341SAndroid Build Coastguard Worker vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 3857*c0909341SAndroid Build Coastguard Worker mova m10, m9 3858*c0909341SAndroid Build Coastguard Worker.w8_loop_start: 3859*c0909341SAndroid Build Coastguard Worker vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 3860*c0909341SAndroid Build Coastguard Worker call .main 3861*c0909341SAndroid Build Coastguard Worker vpblendd m10, m9, 0xCC 3862*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm10 3863*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m10, 1 3864*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3865*c0909341SAndroid Build Coastguard Worker sub hd, 2 3866*c0909341SAndroid Build Coastguard Worker jg .w8_loop 3867*c0909341SAndroid Build Coastguard Worker RET 3868*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3869*c0909341SAndroid Build Coastguard Worker.w16: 3870*c0909341SAndroid Build Coastguard Worker ALLOC_STACK 32, 16 3871*c0909341SAndroid 
Build Coastguard Worker vpbroadcastw m15, r8m ; bitdepth_max 3872*c0909341SAndroid Build Coastguard Worker sub hd, 2 3873*c0909341SAndroid Build Coastguard Worker TAIL_CALL .w16_main, 0 3874*c0909341SAndroid Build Coastguard Worker.w16_main: 3875*c0909341SAndroid Build Coastguard Worker mova xm10, [base+filter_shuf2] 3876*c0909341SAndroid Build Coastguard Worker FILTER_1BLK 13, 0, 6, 10, 15 3877*c0909341SAndroid Build Coastguard Worker vpermq m12, m13, q3120 3878*c0909341SAndroid Build Coastguard Worker mova xm14, [base+filter_shuf3] 3879*c0909341SAndroid Build Coastguard Worker vinserti128 m14, [base+filter_shuf1], 1 3880*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [tlq+10] 3881*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ 3882*c0909341SAndroid Build Coastguard Worker psrldq m6, m12, 8 3883*c0909341SAndroid Build Coastguard Worker vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 3884*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m12 3885*c0909341SAndroid Build Coastguard Worker vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 3886*c0909341SAndroid Build Coastguard Worker FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 3887*c0909341SAndroid Build Coastguard Worker vpblendd m13, m12, 0xCC 3888*c0909341SAndroid Build Coastguard Worker vpermq m12, m12, q2031 ; 6___ 5___ 3889*c0909341SAndroid Build Coastguard Worker psrldq xm6, xm12, 2 3890*c0909341SAndroid Build Coastguard Worker psrldq xm8, xm12, 12 3891*c0909341SAndroid Build Coastguard Worker vpblendd xm6, xm8, 0x01 3892*c0909341SAndroid Build Coastguard Worker pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ 3893*c0909341SAndroid Build Coastguard Worker FILTER_1BLK 11, 6, 8, 10, 15 3894*c0909341SAndroid Build Coastguard Worker vpermq m11, m11, q3120 3895*c0909341SAndroid Build Coastguard Worker pshufd m9, m11, q1032 3896*c0909341SAndroid Build Coastguard Worker movu m8, [tlq+6] ; __43 210_ | ____ ____ 3897*c0909341SAndroid Build Coastguard Worker pshufd m8, m8, q3021 ; 
__0_ 4321 | ____ ____ 3898*c0909341SAndroid Build Coastguard Worker pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ 3899*c0909341SAndroid Build Coastguard Worker vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 3900*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm13 3901*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m13, 1 3902*c0909341SAndroid Build Coastguard Worker lea r7, [20+hq*2] 3903*c0909341SAndroid Build Coastguard Worker sub tlq, r7 3904*c0909341SAndroid Build Coastguard Worker vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 3905*c0909341SAndroid Build Coastguard Worker jmp .w16_loop_start 3906*c0909341SAndroid Build Coastguard Worker.w16_loop: 3907*c0909341SAndroid Build Coastguard Worker vpermq m13, m13, q3322 3908*c0909341SAndroid Build Coastguard Worker vpermq m11, m9, q2020 3909*c0909341SAndroid Build Coastguard Worker vpermq m9, m9, q1302 3910*c0909341SAndroid Build Coastguard Worker vpermq m6, m12, q0123 3911*c0909341SAndroid Build Coastguard Worker psrldq m7, 4 3912*c0909341SAndroid Build Coastguard Worker vpblendd m13, m10, 0xCC 3913*c0909341SAndroid Build Coastguard Worker vpblendd m9, m7, 0x40 3914*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+8] 3915*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm13 3916*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m13, 1 3917*c0909341SAndroid Build Coastguard Worker.w16_loop_start: 3918*c0909341SAndroid Build Coastguard Worker mova m13, m12 3919*c0909341SAndroid Build Coastguard Worker vpblendd m0, [tlq+hq*2], 0x0C 3920*c0909341SAndroid Build Coastguard Worker psrldq m7, m12, 8 3921*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m12 3922*c0909341SAndroid Build Coastguard Worker vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 3923*c0909341SAndroid Build Coastguard Worker vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 3924*c0909341SAndroid Build Coastguard Worker FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 
; ---------------------------------------------------------------------------
; Tail of the filter predictor. The cglobal header, .w16_main/.w16_loop and
; the FILTER_1BLK / FILTER_2BLK macros live above this point.
; The "4321 056_"-style remarks are the original lane maps and are kept as-is.
; ---------------------------------------------------------------------------
    vpermq          m12, m10, q2031
    mova            [rsp+8], m0
    psrldq          m8, m11, 8
    psrldq          xm6, xm12, 2
    psrldq          xm7, xm12, 10
    psrldq          xm0, xm13, 2
    punpcklwd       m8, m11
    punpcklwd       xm7, xm6
    vpblendd        m8, m9, 0x73                ; 56_0 4321 | ____ 4321
    vpblendd        m8, m7, 0x04                ; 56_0 4321 | __56 4321
    vpblendd        m8, m0, 0x08                ; 56_0 4321 | _056 4321
    call            .main
    vpermq          m8, m11, q3120
    vpblendd        m6, m8, m9, 0xCC
    mova            [dstq+strideq*0+16], xm6
    vextracti128    [dstq+strideq*1+16], m6, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w16_loop
    ; loop finished: build the last block and store the final row pair
    vpermq          m8, m9, q3120
    vextracti128    xm0, m8, 1                  ; 4321 ____
    pshufd          xm11, xm11, q1032
    vpblendd        xm0, xm11, 0x02             ; 4321 0___
    psrldq          xm6, xm8, 2
    psrldq          xm7, xm8, 12
    pblendw         xm0, xm6, 0x4               ; 4321 05__
    pblendw         xm0, xm7, 0x2               ; 4321 056_
    FILTER_1BLK     6, 0, 7, [base+filter_shuf2], 15
    vpermq          m12, m13, q1302
    vpblendd        m12, m10, 0xCC
    vpblendd        m9, m6, 0xCC
    mova            [dstq+strideq*0+ 0], xm12
    mova            [dstq+strideq*0+16], xm9
    vextracti128    [dstq+strideq*1+ 0], m12, 1
    vextracti128    [dstq+strideq*1+16], m9, 1
    ret
ALIGN function_align
.w32:
    ; 32-wide case: the left 16 columns go through .w16_main, after which dst
    ; is moved 32 bytes to the right and the remaining columns are done here.
    ALLOC_STACK     64, 16
    vpbroadcastw    m15, r8m                    ; bitdepth_max
    sub             hd, 2
    lea             r3, [dstq+32]               ; dst of the right half
    lea             r5d, [hd*2+20]              ; 2*(h-2)+20
    call            .w16_main
    mov             dstq, r3
    lea             tlq, [tlq+r5+32]
    ; (r5-20)/2-2: r5 is the register renamed to `h` by DEFINE_ARGS below
    sub             r5d, 20
    shr             r5d, 1
    sub             r5d, 2
    lea             r4, [dstq+strideq*2-2]      ; becomes `left` below
DEFINE_ARGS dst, stride, tl, stride3, left, h
    lea             stride3q, [strideq*3]
    movu            m8, [tlq-6]                 ; 4321 0___
    mova            xm10, [base+filter_shuf2]
    pinsrw          xm0, xm8, [dstq+strideq*0-2], 2
    pinsrw          xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
    pinsrw          xm9, [leftq+strideq*0], 5
    pinsrw          xm9, [leftq+strideq*1], 4
    FILTER_1BLK     13, 0, 6, 10, 15
    vpermq          m12, m13, q3120
    mova            xm14, [base+filter_shuf3]
    vinserti128     m14, [base+filter_shuf1], 1
    psrldq          m6, m12, 8
    punpcklwd       m7, m6, m12
    vpblendd        m0, m6, 0x03                ; ___0 ____ | _0__ 4321
    vpblendd        m0, m7, 0x80                ; 56_0 ____ | _0__ 4321
    vpblendd        m0, m8, 0x30                ; 56_0 4321 | _0__ 4321
    vpblendd        m0, m9, 0x04                ; 56_0 4321 | _056 4321
    FILTER_2BLK     12, 0, 6, 7, 8, 14, 15
    vpblendd        m13, m12, 0xCC
    pinsrw          xm9, [leftq+strideq*2], 3
    pinsrw          xm9, [leftq+stride3q ], 2
    lea             leftq, [leftq+strideq*4]
    pinsrw          xm9, [leftq+strideq*0], 1
    pinsrw          xm9, [leftq+strideq*1], 0
    movq            [rsp+32], xm9               ; cache gathered left samples
    mov             r7d, 1                      ; cached left pairs still unused
    pslldq          m8, m9, 4
    vpblendd        m0, m8, 0x0C                ; ___0 ____ | _056 ____
    vpermq          m12, m12, q2031             ; 6___ 5___
    psrldq          xm6, xm12, 2
    psrldq          xm7, xm12, 12
    vpblendd        xm6, xm7, 0x01              ; ____ _56_
    pblendw         xm6, [tlq+10], 0xF8         ; 4321 056_
    FILTER_1BLK     11, 6, 7, 10, 15
    vpermq          m11, m11, q3120
    pshufd          m9, m11, q1032
    vbroadcasti128  m8, [tlq+22]                ; __43 210_ | ____ ____
    pshufd          m8, m8, q3021               ; __0_ 4321 | ____ ____
    pshufhw         m8, m8, q3201               ; ___0 4321 | ____ ____
    vpblendd        m9, m8, 0x70                ; ___0 4321 | ____ 4321
    mova            [dstq+strideq*0], xm13
    vextracti128    [dstq+strideq*1], m13, 1
    vpermq          m6, m12, q0123              ; ____ 4321 | ____ 4321
    jmp             .w32_loop_start
.w32_loop_last:
    ; h reached zero: one more pass with the saved m0 and no new left samples
    mova            m0, [rsp+0]
    jmp             .w32_loop
.w32_loop_left:
    ; restore m0 and merge the next cached left pair; refill the cache from
    ; memory once r7 runs out (skipped when only 2 rows remain)
    mova            m0, [rsp+0]
    vpblendd        m0, [rsp+32+r7*4-12], 0x0C
    dec             r7d
    jg              .w32_loop
    cmp             hd, 2
    je              .w32_loop
    pinsrw          xm6, [rsp+32], 6
    pinsrw          xm6, [leftq+strideq*2], 5
    pinsrw          xm6, [leftq+stride3q ], 4
    lea             leftq, [leftq+strideq*4]
    pinsrw          xm6, [leftq+strideq*0], 3
    pinsrw          xm6, [leftq+strideq*1], 2
    pinsrw          xm6, [leftq+strideq*2], 1
    pinsrw          xm6, [leftq+stride3q ], 0
    lea             leftq, [leftq+strideq*4]
    movu            [rsp+36], xm6
    pinsrw          xm6, [leftq+strideq*0], 1
    pinsrw          xm6, [leftq+strideq*1], 0
    movd            [rsp+32], xm6
    mov             r7d, 4
.w32_loop:
    vpermq          m13, m13, q3322
    vpermq          m11, m9, q2020
    vpermq          m9, m9, q1302
    vpermq          m6, m12, q0123
    psrldq          m7, 4
    vpblendd        m13, m10, 0xCC
    vpblendd        m9, m7, 0x40                ; ___0 4321 | ____ 4321
    mova            [dstq+strideq*0], xm13
    vextracti128    [dstq+strideq*1], m13, 1
.w32_loop_start:
    mova            m13, m12
    psrldq          m7, m12, 8
    punpcklwd       m7, m12
    vpblendd        m0, m6, 0x33                ; ___0 4321 | _056 4321
    vpblendd        m0, m7, 0x80                ; 56_0 4321 | _056 4321
    FILTER_2BLK     10, 0, 6, 7, 8, 14, 15
    vpermq          m12, m10, q2031
    mova            [rsp+0], m0                 ; reloaded at .w32_loop_left/_last
    psrldq          m8, m11, 8
    psrldq          xm6, xm12, 2
    psrldq          xm7, xm12, 10
    psrldq          xm0, xm13, 2
    punpcklwd       m8, m11
    punpcklwd       xm7, xm6
    vpblendd        m8, m9, 0x73                ; 56_0 4321 | ____ 4321
    vpblendd        m8, m7, 0x04                ; 56_0 4321 | __56 4321
    vpblendd        m8, m0, 0x08                ; 56_0 4321 | _056 4321
    call            .main
    vpermq          m8, m11, q3120
    vpblendd        m6, m8, m9, 0xCC
    mova            [dstq+strideq*0+16], xm6
    vextracti128    [dstq+strideq*1+16], m6, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w32_loop_left              ; h > 0
    jz              .w32_loop_last              ; h == 0
    ; h < 0: build the last block and store the final row pair
    vpermq          m8, m9, q3120
    vextracti128    xm0, m8, 1                  ; 4321 ____
    pshufd          xm11, xm11, q1032
    vpblendd        xm0, xm11, 0x02             ; 4321 0___
    psrldq          xm6, xm8, 2
    psrldq          xm7, xm8, 12
    pblendw         xm0, xm6, 0x4               ; 4321 05__
    pblendw         xm0, xm7, 0x2               ; 4321 056_
    FILTER_1BLK     6, 0, 7, [base+filter_shuf2], 15
    vpermq          m12, m13, q1302
    vpblendd        m12, m10, 0xCC
    vpblendd        m9, m6, 0xCC
    mova            [dstq+strideq*0+ 0], xm12
    mova            [dstq+strideq*0+16], xm9
    vextracti128    [dstq+strideq*1+ 0], m12, 1
    vextracti128    [dstq+strideq*1+16], m9, 1
    RET
.main:
    ; out-of-line FILTER_2BLK, reached through `call .main`
    FILTER_2BLK     9, 8, 6, 7, 0, 14, 15
    ret

; t0 is a scratch GPR: r5 on WIN64, r7 on every other target.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; IPRED_CFL reg
;   in:  m<reg> = signed AC coefficients
;        m0 = DC (broadcast), m1 = alpha (broadcast), m2 = |alpha| << 9
;   out: m<reg> = m0 + sign(ac, alpha) * ((|ac| * |alpha| + 32) >> 6)
;        (pmulhrsw by |alpha|<<9 is a rounded >>6 of the product)
;   clobbers m3. Result is not clamped; callers clamp against m6/m7.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw          m3, m%1, m1                 ; ac with alpha's sign applied
    pabsw           m%1, m%1
    pmulhrsw        m%1, m2
    psignw          m%1, m3                     ; restore combined sign
    paddw           m%1, m0
%endmacro

; ---------------------------------------------------------------------------
; ipred_cfl_top_16bpc(dst, stride, tl, w, h, ac, alpha, <r7m>)
; DC comes from the row above only. Sets up rounding and shift from w, loads
; the first 32 bytes of the top row and dispatches through the
; ipred_cfl_left table (NOTE(review): presumably the .h4-.h32 reduction
; labels of ipred_cfl_left_16bpc; the table is defined outside this excerpt).
; wq is left holding the splat-table target for the final `jmp wq`.
; r7m is the 8th argument; it is broadcast to m7 and used as the upper clamp.
; ---------------------------------------------------------------------------
cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn       hd, hm
    add             tlq, 2                      ; step over one 16-bit sample
    movd            xm4, wd
    pxor            m6, m6                      ; m6 = 0 (lower clamp / zero)
    vpbroadcastw    m7, r7m                     ; m7 = upper clamp
    pavgw           xm4, xm6                    ; xm4 = (w+1)>>1, rounding bias
    tzcnt           wd, wd                      ; w -> log2(w)
    movd            xm5, wd                     ; xm5 = shift count
    movu            m0, [tlq]
    lea             t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd          r6, [t0+wq*4]
    add             r6, t0                      ; r6 = reduction entry
    add             t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    movsxd          wq, [t0+wq*4]
    add             wq, t0                      ; wq = splat entry
    movifnidn       acq, acmp
    jmp             r6

; ---------------------------------------------------------------------------
; ipred_cfl_left_16bpc(dst, stride, tl, w, h, ac, alpha, <r7m>)
; DC comes from the left column only: tl is moved back by 2*h bytes, rounding
; and shift are derived from h, and the .hN ladder below sums the samples.
; ---------------------------------------------------------------------------
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov             hd, hm                      ; zero upper half
    sub             tlq, hq
    movd            xm4, hd
    sub             tlq, hq                     ; tl -= 2*h bytes in total
    pxor            m6, m6
    vpbroadcastw    m7, r7m
    pavgw           xm4, xm6                    ; xm4 = (h+1)>>1
    tzcnt           r6d, hd
    movd            xm5, r6d                    ; xm5 = log2(h)
    movu            m0, [tlq]
    lea             t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd          r6, [t0+r6*4]
    add             r6, t0
    add             t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    tzcnt           wd, wd
    movsxd          wq, [t0+wq*4]
    add             wq, t0
    movifnidn       acq, acmp
    jmp             r6
    ; Fall-through ladder: each step halves the number of live words.
.h32:
    paddw           m0, [tlq+32]
.h16:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
.h8:
    psrldq          xm1, xm0, 8
    paddw           xm0, xm1
.h4:
    punpcklwd       xm0, xm6                    ; widen the 4 partials to dwords
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    paddd           xm0, xm4                    ; + rounding
    psrld           xm0, xm5                    ; >> log2(n)
    vpbroadcastw    m0, xm0                     ; m0 = DC in every word
    jmp             wq

; ---------------------------------------------------------------------------
; ipred_cfl_16bpc(dst, stride, tl, w, h, ac, alpha, <r7m>)
; DC from both edges. Dispatch is two-stage:
;   r6 -> .hN  loads the left column (2*h bytes below tl) into m0
;   wq -> .wN  adds the top row, reduces, divides by (w+h), then falls into
;              .sN, which applies IPRED_CFL to ac[] and stores the rows.
; xm4 = (w+h)>>1 rounding bias, xm5 = tzcnt(w+h). For non-square blocks the
; remaining odd factor (3 or 5) is divided out with pmulhuw by 0xAAAB
; (~2/3 * 2^16) or 0x6667 (~2/5 * 2^16) followed by a >>1.
; m6 = 0 and m7 = broadcast r7m are the clamp bounds for the output.
; ---------------------------------------------------------------------------
cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn       hd, hm
    movifnidn       wd, wm
    tzcnt           r6d, hd
    lea             t0d, [wq+hq]
    movd            xm4, t0d
    tzcnt           t0d, t0d
    movd            xm5, t0d
    lea             t0, [ipred_cfl_16bpc_avx2_table]
    tzcnt           wd, wd
    movsxd          r6, [t0+r6*4]
    movsxd          wq, [t0+wq*4+4*4]           ; .wN entries sit 4 slots in
    psrlw           xm4, 1
    pxor            m6, m6
    vpbroadcastw    m7, r7m
    add             r6, t0
    add             wq, t0
    movifnidn       acq, acmp
    jmp             r6
.h4:
    movq            xm0, [tlq-8]
    jmp             wq
.w4:
    movq            xm1, [tlq+2]
    paddw           m0, m4
    paddw           m0, m1
    psrlq           m1, m0, 32
    paddw           m0, m1
    psrld           m1, m0, 16
    paddw           m0, m1                      ; word 0 of each qword = its sum
    cmp             hd, 4
    jg              .w4_mul
    psrlw           xm0, 3                      ; 4x4: (sum+4)>>3
    jmp             .w4_end
.w4_mul:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
    lea             r2d, [hq*2]
    mov             r6d, 0xAAAB6667
    shrx            r6d, r6d, r2d               ; h=8: 0xAAAB, h=16: count wraps, low word 0x6667
    punpckhwd       xm1, xm0, xm6
    punpcklwd       xm0, xm6
    paddd           xm0, xm1
    movd            xm1, r6d
    psrld           xm0, 2
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w4_end:
    vpbroadcastw    m0, xm0
.s4:
    vpbroadcastw    m1, alpham
    lea             r6, [strideq*3]
    pabsw           m2, m1
    psllw           m2, 9
.s4_loop:
    ; 32 bytes of ac = four rows of four samples per iteration
    mova            m4, [acq]
    IPRED_CFL       4
    pmaxsw          m4, m6
    pminsw          m4, m7
    vextracti128    xm5, m4, 1
    movq            [dstq+strideq*0], xm4
    movq            [dstq+strideq*2], xm5
    movhps          [dstq+strideq*1], xm4
    movhps          [dstq+r6 ], xm5
    lea             dstq, [dstq+strideq*4]
    add             acq, 32
    sub             hd, 4
    jg              .s4_loop
    RET
ALIGN function_align
.h8:
    mova            xm0, [tlq-16]
    jmp             wq
.w8:
    vextracti128    xm1, m0, 1
    paddw           xm0, [tlq+2]
    paddw           xm0, xm4
    paddw           xm0, xm1
    psrld           xm1, xm0, 16
    paddw           xm0, xm1
    pblendw         xm0, xm6, 0xAA              ; clear odd words -> dword sums
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 8
    je              .w8_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    cmp             hd, 32
    cmovz           r6d, r2d                    ; h=32 -> 0x6667, otherwise 0xAAAB
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w8_end:
    vpbroadcastw    m0, xm0
.s8:
    vpbroadcastw    m1, alpham
    lea             r6, [strideq*3]
    pabsw           m2, m1
    psllw           m2, 9
.s8_loop:
    ; 64 bytes of ac = four rows of eight samples per iteration
    mova            m4, [acq]
    mova            m5, [acq+32]
    IPRED_CFL       4
    IPRED_CFL       5
    pmaxsw          m4, m6
    pmaxsw          m5, m6
    pminsw          m4, m7
    pminsw          m5, m7
    mova            [dstq+strideq*0], xm4
    mova            [dstq+strideq*2], xm5
    vextracti128    [dstq+strideq*1], m4, 1
    vextracti128    [dstq+r6 ], m5, 1
    lea             dstq, [dstq+strideq*4]
    add             acq, 64
    sub             hd, 4
    jg              .s8_loop
    RET
ALIGN function_align
.h16:
    mova            m0, [tlq-32]
    jmp             wq
.w16:
    paddw           m0, [tlq+2]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpckhwd       xm1, xm0, xm6
    punpcklwd       xm0, xm6
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 16
    je              .w16_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    test            hb, 8|32
    cmovz           r6d, r2d                    ; h not 8/32 -> 0x6667
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w16_end:
    vpbroadcastw    m0, xm0
.s16:
    vpbroadcastw    m1, alpham
    pabsw           m2, m1
    psllw           m2, 9
.s16_loop:
    ; 64 bytes of ac = two rows of sixteen samples per iteration
    mova            m4, [acq]
    mova            m5, [acq+32]
    IPRED_CFL       4
    IPRED_CFL       5
    pmaxsw          m4, m6
    pmaxsw          m5, m6
    pminsw          m4, m7
    pminsw          m5, m7
    mova            [dstq+strideq*0], m4
    mova            [dstq+strideq*1], m5
    lea             dstq, [dstq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .s16_loop
    RET
ALIGN function_align
.h32:
    mova            m0, [tlq-64]
    paddw           m0, [tlq-32]
    jmp             wq
.w32:
    paddw           m0, [tlq+ 2]
    paddw           m0, [tlq+34]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpcklwd       xm1, xm0, xm6
    punpckhwd       xm0, xm6
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp             hd, 32
    je              .w32_end
    lea             r2d, [hq*2]
    mov             r6d, 0x6667AAAB
    shrx            r6d, r6d, r2d               ; h=8: 0x6667, h=16: count wraps, low word 0xAAAB
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w32_end:
    vpbroadcastw    m0, xm0
.s32:
    vpbroadcastw    m1, alpham
    pabsw           m2, m1
    psllw           m2, 9
.s32_loop:
    ; 64 bytes of ac = one row of thirty-two samples per iteration
    mova            m4, [acq]
    mova            m5, [acq+32]
    IPRED_CFL       4
    IPRED_CFL       5
    pmaxsw          m4, m6
    pmaxsw          m5, m6
    pminsw          m4, m7
    pminsw          m5, m7
    mova            [dstq+32*0], m4
    mova            [dstq+32*1], m5
    add             dstq, strideq
    add             acq, 64
    dec             hd
    jg              .s32_loop
    RET

; ---------------------------------------------------------------------------
; ipred_cfl_128_16bpc(dst, stride, tl, w, h, ac, alpha, <r7m>)
; No edge samples are read: m0 is loaded from a constant at pw_512 + 4*(r7m>>11)
; and control jumps straight to the splat-table entry for this width.
; NOTE(review): the index is 0 for r7m=0x3ff and 1 for r7m=0xfff; confirm that
; the dword following pw_512 in the rodata section is the intended constant.
; ---------------------------------------------------------------------------
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov             r6d, r7m
    shr             r6d, 11
    lea             t0, [ipred_cfl_splat_16bpc_avx2_table]
    tzcnt           wd, wd
    movifnidn       hd, hm
    movsxd          wq, [t0+wq*4]
    vpbroadcastd    m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
    pxor            m6, m6
    vpbroadcastw    m7, r7m
    add             wq, t0
    movifnidn       acq, acmp
    jmp             wq

; ---------------------------------------------------------------------------
; ipred_cfl_ac_420_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Fills ac[] from ypx with 2x2 subsampling: pmaddwd against pw_2 combines each
; horizontal pair and two source rows are added per output row. m4 collects
; running dword sums of everything written; r5 keeps the original ac pointer
; so .dc can derive the element count from the distance written.
; hpad is scaled by 4 on entry and h is reduced by that amount, so the main
; loops only touch real rows; padding rows replicate the last output row.
; The routine continues past .dc_loop.
; ---------------------------------------------------------------------------
cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn       hpadd, hpadm
    vpbroadcastd    m5, [pw_2]
    mov             hd, hm
    shl             hpadd, 2
    pxor            m4, m4                      ; running sum
    sub             hd, hpadd
    cmp             dword wm, 8
    jg              .w16
    je              .w8
.w4:
    lea             r3, [strideq*3]
    mov             r5, acq
.w4_loop:
    ; rows 2/3 in the low lanes, rows 0/1 in the high lanes
    mova            xm0, [ypxq+strideq*2]
    mova            xm1, [ypxq+r3 ]
    vinserti128     m0, [ypxq+strideq*0], 1
    vinserti128     m1, [ypxq+strideq*1], 1
    lea             ypxq, [ypxq+strideq*4]
    pmaddwd         m0, m5
    pmaddwd         m1, m5
    paddd           m0, m1
    vextracti128    xm1, m0, 1
    paddd           m4, m0
    packssdw        xm1, xm0                    ; two output rows of 4 words
    mova            [acq], xm1
    add             acq, 16
    sub             hd, 2
    jg              .w4_loop
    test            hpadd, hpadd
    jz              .dc
    vpermq          m1, m1, q1111               ; replicate the last output row
    pslld           xm0, 2                      ; 4 padded rows per iteration
.w4_hpad_loop:
    mova            [acq], m1
    paddd           m4, m0
    add             acq, 32
    sub             hpadd, 4
    jg              .w4_hpad_loop
    jmp             .dc
.w8:
    mov             r5, acq
    test            wpadd, wpadd
    jnz             .w8_wpad1
.w8_loop:
    pmaddwd         m0, m5, [ypxq+strideq*0]
    pmaddwd         m1, m5, [ypxq+strideq*1]
    lea             ypxq, [ypxq+strideq*2]
    paddd           m0, m1
    vextracti128    xm1, m0, 1
    paddd           m4, m0
    packssdw        xm1, xm0, xm1
    mova            [acq], xm1
    add             acq, 16
    dec             hd
    jg              .w8_loop
.w8_hpad:
    test            hpadd, hpadd
    jz              .dc
    vinserti128     m1, xm1, 1                  ; last row in both lanes
    pslld           m0, 2
    jmp             .hpad
.w8_wpad1:
    ; only the left half of the source row is read; its last value fills the rest
    pmaddwd         xm0, xm5, [ypxq+strideq*0]
    pmaddwd         xm3, xm5, [ypxq+strideq*1]
    lea             ypxq, [ypxq+strideq*2]
    paddd           xm0, xm3
    pshufd          xm3, xm0, q3333
    packssdw        xm1, xm0, xm3
    paddd           xm0, xm3
    paddd           xm4, xm0
    mova            [acq], xm1
    add             acq, 16
    dec             hd
    jg              .w8_wpad1
    jmp             .w8_hpad
.w16_wpad:
    ; wpad == 1 -> .w16_wpad1, wpad == 2 -> .w16_wpad2, larger falls through
    mova            m0, [ypxq+strideq*0+ 0]
    mova            m1, [ypxq+strideq*1+ 0]
    cmp             wpadd, 2
    jl              .w16_wpad1
    je              .w16_wpad2
    vpbroadcastd    m2, [ypxq+strideq*0+12]
    vpbroadcastd    m3, [ypxq+strideq*1+12]
    vpblendd        m0, m2, 0xf0
    vpblendd        m1, m3, 0xf0
    jmp             .w16_wpad_end
.w16_wpad2:
    vpbroadcastd    m2, [ypxq+strideq*0+28]
    vpbroadcastd    m3, [ypxq+strideq*1+28]
    jmp             .w16_wpad_end
.w16_wpad1:
    vpbroadcastd    m2, [ypxq+strideq*0+44]
    vpbroadcastd    m3, [ypxq+strideq*1+44]
    vinserti128     m2, [ypxq+strideq*0+32], 0
    vinserti128     m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea             ypxq, [ypxq+strideq*2]
    REPX            {pmaddwd x, m5}, m0, m1, m2, m3
    paddd           m0, m1
    paddd           m2, m3
    packssdw        m1, m0, m2
    paddd           m0, m2
    vpermq          m1, m1, q3120               ; undo the per-lane interleave
    paddd           m4, m0
    mova            [acq], m1
    add             acq, 32
    dec             hd
    jg              .w16_wpad
    jmp             .w16_hpad
.w16:
    mov             r5, acq
    test            wpadd, wpadd
    jnz             .w16_wpad
.w16_loop:
    pmaddwd         m0, m5, [ypxq+strideq*0+ 0]
    pmaddwd         m2, m5, [ypxq+strideq*0+32]
    pmaddwd         m1, m5, [ypxq+strideq*1+ 0]
    pmaddwd         m3, m5, [ypxq+strideq*1+32]
    lea             ypxq, [ypxq+strideq*2]
    paddd           m0, m1
    paddd           m2, m3
    packssdw        m1, m0, m2
    paddd           m0, m2
    vpermq          m1, m1, q3120
    paddd           m4, m0
    mova            [acq], m1
    add             acq, 32
    dec             hd
    jg              .w16_loop
.w16_hpad:
    add             hpadd, hpadd                ; doubles hpad and tests for zero
    jz              .dc
    paddd           m0, m0                      ; two padded rows per .hpad pass
.hpad:
    mova            [acq+32*0], m1
    paddd           m4, m0
    mova            [acq+32*1], m1
    add             acq, 32*2
    sub             hpadd, 4
    jg              .hpad
.dc:
    ; reduce m4 to one dword, then take the rounded average over the
    ; elements written
    vextracti128    xm1, m4, 1
    sub             r5, acq                     ; -w*h*2
    tzcnt           r1d, r5d
    paddd           xm4, xm1
    sub             r1d, 2
    punpckhqdq      xm1, xm4, xm4
    movd            xm0, r1d
    paddd           xm1, xm4
    pshuflw         xm4, xm1, q1032
    paddd           xm1, xm4
    psrld           xm1, xm0
    pxor            xm0, xm0
    pavgw           xm1, xm0                    ; final rounded halving
    vpbroadcastw    m1, xm1
.dc_loop:
; Tail of ipred_cfl_ac_420_16bpc: body of its .dc_loop, whose label sits on the
; line just above. r5 starts at minus the byte size of the ac buffer and climbs
; to zero while the average broadcast in m1 is subtracted from every word.
    mova            m0, [acq+r5]
    psubw           m0, m1
    mova            [acq+r5], m0
    add             r5, 32
    jl              .dc_loop
    RET

;------------------------------------------------------------------------------
; ipred_cfl_ac_422_16bpc(ac, ypx, stride, wpad, hpad, w, h)
;
; Fills the 16-bit ac buffer from the luma rows at ypx. pmaddwd against the
; pw_4 constant folds every horizontal pair of input words into one output
; (pw_4 is defined elsewhere; presumably 4 in each word, i.e. pair sum * 4 --
; TODO confirm). Rows are never combined vertically: one input row gives one
; output row.
;
;   w     selects the path: above 8 = .w16, exactly 8 = .w8, otherwise .w4
;   wpad  non-zero: trailing groups of 4 outputs are replaced by copies of the
;         last kept output (.w8: one group, .w16: one to three groups).
;         The .w4 path never reads it.
;   hpad  hpad*4 bottom rows are not read; they repeat the last computed row
;
; Register roles
;   m5  pmaddwd multiplier          m4  dword partial sums of all stored values
;   r5  start of ac (w is dead)     r3  stride*3 in .w4 (wpad is dead there)
;
; There is no epilogue in this routine: every path tail-jumps into
; ipred_cfl_ac_420_16bpc_avx2 (.hpad / .w16_hpad / .dc), which finishes the
; vertical padding, subtracts the rounded average from the buffer and returns.
; Those tails expect m1 = last stored row, m0 = its dword sums, m4, r5, acq and
; hpadd exactly as they are left below.
;------------------------------------------------------------------------------
cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn       hpadd, hpadm
    vpbroadcastd    m5, [pw_4]
    mov             hd, hm
    shl             hpadd, 2                ; rows of vertical padding
    pxor            m4, m4                  ; clear the running sum
    sub             hd, hpadd               ; rows that are really read
    cmp             dword wm, 8
    jg              .w16
    je              .w8
.w4:
    lea             r3, [strideq*3]
    mov             r5, acq
.w4_loop:                                   ; four rows per iteration
    mova            xm0, [ypxq+strideq*0]
    mova            xm1, [ypxq+strideq*1]
    vinserti128     m0, [ypxq+strideq*2], 1
    vinserti128     m1, [ypxq+r3       ], 1
    lea             ypxq, [ypxq+strideq*4]
    pmaddwd         m0, m5                  ; rows 0 | 2
    pmaddwd         m1, m5                  ; rows 1 | 3
    paddd           m4, m0
    packssdw        m0, m1                  ; words of rows 0, 1 | 2, 3
    paddd           m4, m1
    mova            [acq], m0
    add             acq, 32
    sub             hd, 4
    jg              .w4_loop
    test            hpadd, hpadd
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vextracti128    xm1, m1, 1              ; dword sums of the last row
    vpermq          m0, m0, q3333           ; last row repeated four times
    pslld           xm1, 2                  ; x4: each store below is four rows
.w4_hpad_loop:
    mova            [acq], m0
    paddd           m4, m1
    add             acq, 32
    sub             hpadd, 4
    jg              .w4_hpad_loop
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    mov             r5, acq
    test            wpadd, wpadd
    jnz             .w8_wpad1
.w8_loop:                                   ; two rows per iteration
    pmaddwd         m1, m5, [ypxq+strideq*0]
    pmaddwd         m0, m5, [ypxq+strideq*1]
    lea             ypxq, [ypxq+strideq*2]
    paddd           m4, m1
    packssdw        m1, m0                  ; lanes hold row0/row1 halves mixed
    paddd           m4, m0
    vpermq          m2, m1, q3120           ; back to row 0 | row 1
    mova            [acq], m2
    add             acq, 32
    sub             hd, 2
    jg              .w8_loop
.w8_hpad:
    test            hpadd, hpadd
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vpermq          m1, m1, q3131           ; second (last) row in both lanes
    pslld           m0, 2                   ; x4: .hpad stores four rows a pass
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w8_wpad1:
    ; Upper lane = the input pair at byte 12 repeated, so outputs 4-7 become
    ; copies of output 3; the lower lane is the real first 16 bytes of the row.
    vpbroadcastd    m1, [ypxq+strideq*0+12]
    vpbroadcastd    m0, [ypxq+strideq*1+12]
    vinserti128     m1, [ypxq+strideq*0+ 0], 0
    vinserti128     m0, [ypxq+strideq*1+ 0], 0
    lea             ypxq, [ypxq+strideq*2]
    pmaddwd         m1, m5
    pmaddwd         m0, m5
    paddd           m4, m1
    packssdw        m1, m0
    paddd           m4, m0
    vpermq          m2, m1, q3120
    mova            [acq], m2
    add             acq, 32
    sub             hd, 2
    jg              .w8_wpad1
    jmp             .w8_hpad
.w16:
    mov             r5, acq
    test            wpadd, wpadd
    jnz             .w16_wpad
.w16_loop:                                  ; two rows per iteration
    pmaddwd         m2, m5, [ypxq+strideq*0+ 0]
    pmaddwd         m1, m5, [ypxq+strideq*0+32]
    pmaddwd         m0, m5, [ypxq+strideq*1+ 0]
    pmaddwd         m3, m5, [ypxq+strideq*1+32]
    lea             ypxq, [ypxq+strideq*2]
    paddd           m4, m2
    packssdw        m2, m1                  ; row 0
    paddd           m4, m1
    packssdw        m1, m0, m3              ; row 1
    paddd           m0, m3                  ; row 1 sums, kept for .w16_hpad
    vpermq          m2, m2, q3120
    paddd           m4, m0
    vpermq          m1, m1, q3120
    mova            [acq+32*0], m2
    mova            [acq+32*1], m1
    add             acq, 32*2
    sub             hd, 2
    jg              .w16_loop
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
.w16_wpad:
    mova            m2, [ypxq+strideq*0+ 0]
    mova            m0, [ypxq+strideq*1+ 0]
    cmp             wpadd, 2
    jl              .w16_wpad1
    je              .w16_wpad2
    ; wpad above 2: keep 4 outputs, the pair at byte 12 fills everything else
    vpbroadcastd    m1, [ypxq+strideq*0+12]
    vpbroadcastd    m3, [ypxq+strideq*1+12]
    vpblendd        m2, m1, 0xf0
    vpblendd        m0, m3, 0xf0
    jmp             .w16_wpad_end
.w16_wpad2:
    ; wpad == 2: keep 8 outputs, the pair at byte 28 fills the second vector
    vpbroadcastd    m1, [ypxq+strideq*0+28]
    vpbroadcastd    m3, [ypxq+strideq*1+28]
    jmp             .w16_wpad_end
.w16_wpad1:
    ; wpad below 2: keep 12 outputs, the pair at byte 44 fills the top lane
    vpbroadcastd    m1, [ypxq+strideq*0+44]
    vpbroadcastd    m3, [ypxq+strideq*1+44]
    vinserti128     m1, [ypxq+strideq*0+32], 0
    vinserti128     m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea             ypxq, [ypxq+strideq*2]
    REPX            {pmaddwd x, m5}, m2, m0, m1, m3
    paddd           m4, m2
    packssdw        m2, m1
    paddd           m4, m1
    packssdw        m1, m0, m3
    paddd           m0, m3
    vpermq          m2, m2, q3120
    paddd           m4, m0
    vpermq          m1, m1, q3120
    mova            [acq+32*0], m2
    mova            [acq+32*1], m1
    add             acq, 32*2
    sub             hd, 2
    jg              .w16_wpad
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad

;------------------------------------------------------------------------------
; ipred_cfl_ac_444_16bpc(ac, ypx, stride, wpad, hpad, w, h)
;
; Same arguments as the 4:2:2 routine above, but nothing is subsampled: each
; luma word is shifted left by 3 and stored directly. The width path is chosen
; through ipred_cfl_ac_444_16bpc_avx2_table, indexed by tzcnt(w).
;------------------------------------------------------------------------------
cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
; Body of ipred_cfl_ac_444_16bpc. ac = luma << 3 with no subsampling; pmaddwd
; against pw_1 (defined elsewhere; presumably all ones -- TODO confirm) is only
; used to widen the stored words into the dword partial sums kept in m4.
; r5 keeps the start of ac. All paths finish in ipred_cfl_ac_420_16bpc_avx2's
; .hpad / .dc tails, which expect m1 = last row, m0 = its sums, m4, r5, acq and
; hpadd as they are left here.
    lea             r6, [ipred_cfl_ac_444_16bpc_avx2_table]
    tzcnt           wd, wm
    movifnidn       hpadd, hpadm
    vpbroadcastd    m5, [pw_1]
    movsxd          wq, [r6+wq*4]           ; 32-bit offset relative to table
    shl             hpadd, 2                ; rows of vertical padding
    add             wq, r6
    mov             hd, hm
    pxor            m4, m4                  ; clear the running sum
    sub             hd, hpadd               ; rows that are really read
    jmp             wq
.w4:
    lea             r3, [strideq*3]
    mov             r5, acq
.w4_loop:                                   ; four 8-byte rows in one register
    movq            xm0, [ypxq+strideq*0]
    movhps          xm0, [ypxq+strideq*1]
    vpbroadcastq    m1, [ypxq+strideq*2]
    vpbroadcastq    m2, [ypxq+r3       ]
    lea             ypxq, [ypxq+strideq*4]
    vpblendd        m0, m1, 0x30            ; row 2 into qword 2
    vpblendd        m0, m2, 0xc0            ; row 3 into qword 3
    psllw           m0, 3
    pmaddwd         m1, m0, m5
    mova            [acq], m0
    add             acq, 32
    paddd           m4, m1
    sub             hd, 4
    jg              .w4_loop
    test            hpadd, hpadd
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    ; No loop here: eight copies of the last row are written unconditionally
    ; and the sum receives eight times that row's total.
    vpermq          m0, m0, q3333
    paddd           m1, m1
    mova            [acq+32*0], m0
    vpermq          m1, m1, q3333
    mova            [acq+32*1], m0
    add             acq, 32*2
    paddd           m4, m1
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    lea             r3, [strideq*3]
    mov             r5, acq
.w8_loop:                                   ; four rows, two per register
    mova            xm2, [ypxq+strideq*0]
    vinserti128     m2, [ypxq+strideq*1], 1
    mova            xm1, [ypxq+strideq*2]
    vinserti128     m1, [ypxq+r3       ], 1
    lea             ypxq, [ypxq+strideq*4]
    psllw           m2, 3
    psllw           m1, 3
    mova            [acq+32*0], m2
    pmaddwd         m2, m5
    mova            [acq+32*1], m1
    pmaddwd         m0, m1, m5
    add             acq, 32*2
    paddd           m4, m2
    paddd           m4, m0
    sub             hd, 4
    jg              .w8_loop
    test            hpadd, hpadd
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vperm2i128      m1, m1, 0x11            ; last row in both lanes
    pslld           m0, 2                   ; x4: .hpad stores four rows a pass
    pxor            m2, m2
    vpblendd        m0, m2, 0x0f            ; drop the sums of the row before
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w16_wpad2:
    ; Out-of-line part of .w16_loop: the word at byte 14 (pixel 7) replaces the
    ; upper lane of both rows.
    vpbroadcastw    m3, [ypxq+strideq*0+14]
    vpbroadcastw    m0, [ypxq+strideq*1+14]
    vpblendd        m2, m3, 0xf0
    vpblendd        m1, m0, 0xf0
    jmp             .w16_wpad_end
.w16:
    mov             r5, acq
.w16_loop:                                  ; two rows per iteration
    mova            m2, [ypxq+strideq*0]
    mova            m1, [ypxq+strideq*1]
    test            wpadd, wpadd
    jnz             .w16_wpad2
.w16_wpad_end:
    lea             ypxq, [ypxq+strideq*2]
    psllw           m2, 3
    psllw           m1, 3
    mova            [acq+32*0], m2
    pmaddwd         m2, m5
    mova            [acq+32*1], m1
    pmaddwd         m0, m1, m5
    add             acq, 32*2
    paddd           m4, m2
    paddd           m4, m0
    sub             hd, 2
    jg              .w16_loop
    add             hpadd, hpadd            ; .hpad counts 4 per two rows here
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd           m0, m0                  ; x2: .hpad stores the row twice
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w32:
    mov             r5, acq
    test            wpadd, wpadd
    jnz             .w32_wpad
.w32_loop:                                  ; one 64-byte row per iteration
    mova            m0, [ypxq+ 0]
    mova            m1, [ypxq+32]
    add             ypxq, strideq
    psllw           m0, 3
    psllw           m1, 3
    pmaddwd         m2, m0, m5
    mova            [acq+32*0], m0
    pmaddwd         m3, m1, m5
    mova            [acq+32*1], m1
    add             acq, 32*2
    paddd           m2, m3
    paddd           m4, m2
    dec             hd
    jg              .w32_loop
.w32_hpad:
    test            hpadd, hpadd
    jz              mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd           m2, m2                  ; x2: two rows per pass below
.w32_hpad_loop:
    mova            [acq+32*0], m0
    mova            [acq+32*1], m1
    paddd           m4, m2
    mova            [acq+32*2], m0
    mova            [acq+32*3], m1
    add             acq, 32*4
    sub             hpadd, 2
    jg              .w32_hpad_loop
    jmp             mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w32_wpad:
    mova            m0, [ypxq+ 0]
    cmp             wpadd, 4
    jl              .w32_wpad2
    je              .w32_wpad4
    ; wpad above 4: keep 8 pixels, the word at byte 14 fills the rest
    vpbroadcastw    m1, [ypxq+14]
    vpblendd        m0, m1, 0xf0
    jmp             .w32_wpad_end
.w32_wpad4:
    ; wpad == 4: keep 16 pixels, the word at byte 30 fills the second vector
    vpbroadcastw    m1, [ypxq+30]
    jmp             .w32_wpad_end
.w32_wpad2:
    ; wpad below 4: keep 24 pixels, the word at byte 46 fills the top lane
    vpbroadcastw    m1, [ypxq+46]
    vinserti128     m1, [ypxq+32], 0
.w32_wpad_end:
    add             ypxq, strideq
    psllw           m0, 3
    psllw           m1, 3
    pmaddwd         m2, m0, m5
    mova            [acq+32*0], m0
    pmaddwd         m3, m1, m5
    mova            [acq+32*1], m1
    add             acq, 32*2
    paddd           m2, m3
    paddd           m4, m2
    dec             hd
    jg              .w32_wpad
    jmp             .w32_hpad

;------------------------------------------------------------------------------
; pal_pred_16bpc(dst, stride, pal, idx, w, h)
;
; Expands packed palette indices into 16-bit pixels.
;
;   pal   16 bytes are loaded, i.e. eight 16-bit palette entries
;   idx   every byte carries two indices, low nibble first; read linearly
;   w     selects the path via pal_pred_16bpc_avx2_table[tzcnt(w)]
;   h     rows to write
;
; m4/m5 are byte lookup tables for pshufb: one lookup yields the low byte and
; the other the high byte of each pixel, and punpck{l,h}bw zips them back into
; words. pal_pred_shuf is defined elsewhere; presumably it regroups the palette
; into a low-byte qword and a high-byte qword -- TODO confirm.
; NOTE(review): pshufb honours bits 0-3 and bit 7 of each index byte, so this
; appears to rely on every nibble being below 8 -- confirm against the caller.
;------------------------------------------------------------------------------
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
    vbroadcasti128  m4, [palq]
    lea             r2, [pal_pred_16bpc_avx2_table] ; pal pointer is dead now
    tzcnt           wd, wm
    vbroadcasti128  m5, [pal_pred_shuf]
    movifnidn       hd, hm
    movsxd          wq, [r2+wq*4]           ; 32-bit offset relative to table
    pshufb          m4, m5
    punpckhqdq      m5, m4, m4              ; upper qword of m4 in both halves
    add             wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
    lea             stride3q, [strideq*3]   ; r2 is reused as stride*3
    jmp             wq
.w4:                                        ; 8 idx bytes = 4 rows of 4 pixels
    movq            xm0, [idxq]
    add             idxq, 8
    psrlw           xm1, xm0, 4             ; high nibbles down to bits 0-3
    punpcklbw       xm0, xm1                ; one index per byte, in order
    pshufb          xm1, xm4, xm0
    pshufb          xm2, xm5, xm0
    punpcklbw       xm0, xm1, xm2           ; pixels 0-7  (rows 0 and 1)
    punpckhbw       xm1, xm2                ; pixels 8-15 (rows 2 and 3)
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*2], xm1
; Remainder of pal_pred_16bpc. In every path m4 and m5 are the two pshufb
; lookup tables built in the prologue (one supplies the low byte, the other the
; high byte of each pixel) and punpck{l,h}bw zips the bytes back into words.
; First the end of the .w4 loop: rows 1 and 3 are the upper halves of xm0/xm1.
    movhps          [dstq+strideq*1], xm0
    movhps          [dstq+stride3q ], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w4
    RET
.w8:                                        ; 16 idx bytes = 4 rows of 8 pixels
    pmovzxbw        m2, [idxq]
    add             idxq, 16
    psllw           m1, m2, 4
    por             m2, m1                  ; low byte/high byte = the 2 indices
    pshufb          m1, m4, m2
    pshufb          m2, m5, m2
    punpcklbw       m0, m1, m2              ; row 0 | row 2
    punpckhbw       m1, m2                  ; row 1 | row 3
    mova            [dstq+strideq*0], xm0
    mova            [dstq+strideq*1], xm1
    vextracti128    [dstq+strideq*2], m0, 1
    vextracti128    [dstq+stride3q ], m1, 1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w8
    RET
.w16:                                       ; 32 idx bytes = 4 rows of 16 pixels
    ; pshufd + vpermq pre-arrange the index dwords so that the two rounds of
    ; in-lane unpacks below come out in plain pixel order.
    pshufd          m3, [idxq], q3120
    add             idxq, 32
    vpermq          m3, m3, q3120
    psrlw           m1, m3, 4
    punpcklbw       m2, m3, m1              ; indices of rows 0 and 1
    punpckhbw       m3, m1                  ; indices of rows 2 and 3
    pshufb          m1, m4, m2
    pshufb          m2, m5, m2
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    pshufb          m1, m4, m3
    pshufb          m3, m5, m3
    punpcklbw       m0, m1, m3
    punpckhbw       m1, m3
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w16
    RET
.w32:                                       ; 32 idx bytes = 2 rows of 32 pixels
    pshufd          m3, [idxq], q3120
    add             idxq, 32
    vpermq          m3, m3, q3120
    psrlw           m1, m3, 4
    punpcklbw       m2, m3, m1              ; indices of row 0
    punpckhbw       m3, m1                  ; indices of row 1
    pshufb          m1, m4, m2
    pshufb          m2, m5, m2
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    mova            [dstq+ 0], m0
    mova            [dstq+32], m1
    pshufb          m1, m4, m3
    pshufb          m3, m5, m3
    punpcklbw       m0, m1, m3
    punpckhbw       m1, m3
    mova            [dstq+strideq+ 0], m0
    mova            [dstq+strideq+32], m1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w32
    RET
.w64:                                       ; 32 idx bytes = 1 row of 64 pixels
    pshufd          m3, [idxq], q3120
    add             idxq, 32
    vpermq          m3, m3, q3120
    psrlw           m1, m3, 4
    punpcklbw       m2, m3, m1              ; indices of pixels 0-31
    punpckhbw       m3, m1                  ; indices of pixels 32-63
    pshufb          m1, m4, m2
    pshufb          m2, m5, m2
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    mova            [dstq+32*0], m0
    mova            [dstq+32*1], m1
    pshufb          m1, m4, m3
    pshufb          m3, m5, m3
    punpcklbw       m0, m1, m3
    punpckhbw       m1, m3
    mova            [dstq+32*2], m0
    mova            [dstq+32*3], m1
    add             dstq, strideq
    dec             hd
    jg              .w64
    RET

; Closes a conditional-assembly block opened earlier in the file (its %if is
; not visible from this part of the source).
%endif