; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
const obmc_masks_avx2
            dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
            dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
            dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
            dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
            dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
            dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
            dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
            dw      0,      0,      0,      0,      0,      0,      0,      0

; Shuffle / permute control vectors (dd = dword indices, db = byte indices).
deint_shuf:     dd 0,  4,  1,  5,  2,  6,  3,  7
subpel_h_shufA: db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
subpel_h_shufB: db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
subpel_s_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
rescale_mul:    dd 0,  1,  2,  3,  4,  5,  6,  7
rescale_mul2:   dd 0,  1,  4,  5,  2,  3,  6,  7
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
wswap:          db 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Paired constants. put_bilin_h_rnd is indexed with (bitdepth_max >> 11) by
; put_bilin_16bpc; the other two-entry tables presumably follow the same
; per-bitdepth layout -- confirm against their users.
prep_mul:         dw 16, 16, 4, 4
put_bilin_h_rnd:  dw 8, 8, 10, 10
put_8tap_h_rnd:   dd 34, 40
s_8tap_h_rnd:     dd 2, 8
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh:  dd 10, 8
prep_8tap_1d_rnd: dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: dd    32 - (8192 <<  5)
warp8x8t_rnd:     dd 16384 - (8192 << 15)
warp8x8_shift:    dd  5,  3
warp8x8_rnd:      dw  4096,  4096, 16384, 16384
bidir_rnd:        dw -16400, -16400, -16388, -16388
bidir_mul:        dw  2048,  2048,  8192,  8192

; Aliases that reuse the leading element(s) of an existing table instead of
; emitting a duplicate constant: prep_mul starts with the word pair 16, 16
; and put_s_8tap_v_rnd starts with the dword 512.
%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; (-8192 << 6) + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
               dd 0

; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
; Emits one 32-bit entry per listed width holding the distance from the
; table base to the .w<width> label of <prefix>_<fn>_16bpc_<isa>, and defines
; <fn>_<isa>_table as (table start - 2*w0). The entries are relative to that
; same biased symbol.
; NOTE(review): the bias presumably lets callers index with tzcnt(w)*4
; directly (true for w0 = 2 or 4) -- confirm against the users of the tables.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128

; BASE_JMP_TABLE fn, isa, w0, w1, ...
; Emits one 16-bit entry per listed width: <fn>_<isa>_w<width> - <fn>_<isa>.
; <fn>_<isa>_table is defined as (table start - w0), so a lookup of the form
; [base + tzcnt(w)*2 + table_offset] (as done at .put below) lands on the
; first entry when w == w0 for w0 = 2 or 4.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; put_avx2 / prep_avx2 name the .put / .prep labels inside the bilin
; functions; they are the reference points for the 16-bit table entries.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128

; HV_JMP_TABLE fn, filter, isa, types, w0, w1, ...
; 'types' is a bit mask selecting which tables to emit:
;   1 -> .h_w<width>, 2 -> .v_w<width>, 4 -> .hv_w<width>
; Each emitted table holds 16-bit offsets relative to <fn>_<isa> and defines
; <fn>_<filter>_{h,v,hv}_<isa>_table as (table start - w0).
; The rep loop rotates the argument list (argc - 4) times; the following
; "rotate 4" completes a full cycle so the next section sees the original
; argument order again.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128

; SCALED_JMP_TABLE fn, isa, w0, w1, ...
; Emits three consecutive tables of 16-bit offsets relative to the function
; entry point: .w<width>, .dy1_w<width> and .dy2_w<width>, defining
; <fn>_<isa>_table, <fn>_<isa>_dy1_table and <fn>_<isa>_dy2_table, each
; biased by -w0. "rotate 2" after each rep loop restores the original
; argument order (argc - 2 rotations + 2 = one full cycle).
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
    %%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
    %%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,    4, 8, 16, 32, 64, 128

; table_offset(type, fn) = <type><fn><SUFFIX>_table - <type><SUFFIX>
; e.g. table_offset(put, _bilin_h) -> put_bilin_h_avx2_table - put_avx2
; when SUFFIX is _avx2 (SUFFIX is not defined in this file; presumably set
; by x86inc's INIT_XMM/INIT_YMM -- confirm).
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

;-----------------------------------------------------------------------
; put_bilin_16bpc(dst, ds, src, ss, w, h, mx, my, bitdepth_max)
;
; Named arguments come from the cglobal line; mx, my and bitdepth_max are
; read from argument slots r6m, r7m and r8m (see the inline comments).
; NOTE(review): the "4, 8, 0" on cglobal is presumably x86inc's
; (args loaded, GPRs used, XMM regs used) triple -- confirm in x86inc.asm.
;
; Dispatch:
;   mx != 0           -> .h   (and from there to .hv when my != 0)
;   mx == 0, my != 0  -> .v
;   mx == 0, my == 0  -> .put (plain row copy)
; Within each path the width-specific loop is reached through a 16-bit
; jump table indexed by tzcnt(w), relative to r7 = put_avx2 (the .put label).
;-----------------------------------------------------------------------
INIT_XMM avx2
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    mov                mxyd, r6m ; mx
    lea                  r7, [put_avx2]
    ; Keep the untouched width reachable as org_w: wd is overwritten with
    ; tzcnt(w) below. UNIX64 copies it into r8d, other ABIs re-read wm.
%if UNIX64
    DECLARE_REG_TMP 8
    %define org_w r8d
    mov                 r8d, wd
%else
    DECLARE_REG_TMP 7
    %define org_w wm
%endif
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    ; No filtering: jump to the copy loop for this width.
    movzx                wd, word [r7+wq*2+table_offset(put,)]
    add                  wq, r7
    jmp                  wq
.put_w2:
    ; 4 bytes per row, two rows per iteration.
    mov                 r6d, [srcq+ssq*0]
    mov                 r7d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6d
    mov        [dstq+dsq*1], r7d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    ; 8 bytes per row, two rows per iteration.
    mov                  r6, [srcq+ssq*0]
    mov                  r7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6
    mov        [dstq+dsq*1], r7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    ; One xmm (16 bytes) per row. Loads are unaligned (movu); stores use
    ; mova, so dst rows are assumed suitably aligned -- TODO confirm.
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
INIT_YMM avx2
    ; From here on mN refers to the 256-bit ymm registers.
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+32*0], m0
    mova  [dstq+dsq*0+32*1], m1
    mova  [dstq+dsq*1+32*0], m2
    mova  [dstq+dsq*1+32*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    ; One row (4 x 32 bytes) per iteration.
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    add                srcq, ssq
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    ; One row (8 x 32 bytes) per iteration, copied in two halves so only
    ; m0-m3 are needed.
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    movu                 m0, [srcq+32*4]
    movu                 m1, [srcq+32*5]
    movu                 m2, [srcq+32*6]
    movu                 m3, [srcq+32*7]
    add                srcq, ssq
    mova        [dstq+32*4], m0
    mova        [dstq+32*5], m1
    mova        [dstq+32*6], m2
    mova        [dstq+32*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; Horizontal filter:
    ;   dst[x] = (src[x]*(16-mx) + src[x+1]*mx + rnd) >> 4
    ; m5 = mx in every word, m4 = 16 - mx, m3 = rnd (loaded below).
    movd                xm5, mxyd
    mov                mxyd, r7m ; my
    vpbroadcastd         m4, [pw_16]
    vpbroadcastw         m5, xm5
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    mov                 r6d, r8m ; bitdepth_max
    add                  wq, r7
    ; r6 = bitdepth_max >> 11 selects the dword pair in put_bilin_h_rnd
    ; (pair 0 = 8, pair 1 = 10).
    shr                 r6d, 11
    vpbroadcastd         m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
    jmp                  wq
.h_w2:
    ; Two rows packed into one xmm (low / high qword). Shifting each qword
    ; right by 16 bits lines src[x+1] up with src[x].
    movq                xm1, [srcq+ssq*0]
    movhps              xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw              xm0, xm4, xm1
    psrlq               xm1, 16
    pmullw              xm1, xm5
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 4
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    ; xm0 = src[x], xm1 = src[x+1] (loaded at +2 bytes); row 0 in the low
    ; qword, row 1 in the high qword.
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    movq                xm1, [srcq+ssq*0+2]
    movhps              xm1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw              xm0, xm4
    pmullw              xm1, xm5
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 4
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    ; Row 0 in the low 128-bit lane, row 1 in the high lane.
    movu                xm0, [srcq+ssq*0]
    vinserti128          m0, [srcq+ssq*1], 1
    movu                xm1, [srcq+ssq*0+2]
    vinserti128          m1, [srcq+ssq*1+2], 1
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    mova       [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; One ymm per row, two rows per iteration.
    pmullw               m0, m4, [srcq+ssq*0]
    pmullw               m1, m5, [srcq+ssq*0+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+ssq*1]
    pmullw               m2, m5, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    ; Two ymm per row, one row per iteration.
    pmullw               m0, m4, [srcq+32*0]
    pmullw               m1, m5, [srcq+32*0+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+32*1]
    pmullw               m2, m5, [srcq+32*1+2]
    add                srcq, ssq
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
.h_w128:
    ; Generic wide path: t0 = original width; the inner loop walks each row
    ; from the right edge towards x = 0 in steps of 32 pixels (64 bytes).
    movifnidn           t0d, org_w
.h_w64_loop0:
    mov                 r6d, t0d
.h_w64_loop:
    pmullw               m0, m4, [srcq+r6*2-32*1]
    pmullw               m1, m5, [srcq+r6*2-32*1+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+r6*2-32*2]
    pmullw               m2, m5, [srcq+r6*2-32*2+2]
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2-32*1], m0
    mova   [dstq+r6*2-32*2], m1
    sub                 r6d, 32
    jg .h_w64_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w64_loop0
    RET
.v:
    ; Vertical filter: dst = a + pmulhrsw(b - a, my << 11), where a and b
    ; are vertically adjacent rows. pmulhrsw computes (x*y + 0x4000) >> 15,
    ; so the (my << 11) factor yields a rounded (b - a) * my / 16.
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    shl                mxyd, 11
    movd                xm5, mxyd
    add                  wq, r7
    vpbroadcastw         m5, xm5
    jmp                  wq
.v_w2:
    movd                xm0, [srcq+ssq*0]
.v_w2_loop:
    ; xm2 = {row n, row n+1}, xm1 = {row n+1, row n+2}; xm0 carries the
    ; last loaded row into the next iteration.
    movd                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpckldq           xm2, xm0, xm1
    movd                xm0, [srcq+ssq*0]
    punpckldq           xm1, xm0
    psubw               xm1, xm2
    pmulhrsw            xm1, xm5
    paddw               xm1, xm2
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq                xm0, [srcq+ssq*0]
.v_w4_loop:
    ; Same scheme as .v_w2 with 64-bit rows.
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq          xm2, xm0, xm1
    movq                xm0, [srcq+ssq*0]
    punpcklqdq          xm1, xm0
    psubw               xm1, xm2
    pmulhrsw            xm1, xm5
    paddw               xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu                xm0, [srcq+ssq*0]
.v_w8_loop:
    ; Rows are broadcast to both lanes and blended (0xf0 = take the high
    ; lane from the second source): m2 = {row n, row n+1},
    ; m1 = {row n+1, row n+2}.
    vbroadcasti128       m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd             m2, m0, m1, 0xf0
    vbroadcasti128       m0, [srcq+ssq*0]
    vpblendd             m1, m0, 0xf0
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    mova       [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w32:
    ; m0/m1 hold the two halves of the previous row; m2/m3 the next row.
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
.v_w32_loop:
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m4, m2, m0
    pmulhrsw             m4, m5
    paddw                m4, m0
    movu                 m0, [srcq+ssq*0+32*0]
    mova  [dstq+dsq*0+32*0], m4
    psubw                m4, m3, m1
    pmulhrsw             m4, m5
    paddw                m4, m1
    movu                 m1, [srcq+ssq*0+32*1]
    mova  [dstq+dsq*0+32*1], m4
    psubw                m4, m0, m2
    pmulhrsw             m4, m5
    paddw                m4, m2
    mova  [dstq+dsq*1+32*0], m4
    psubw                m4, m1, m3
    pmulhrsw             m4, m5
    paddw                m4, m3
    mova  [dstq+dsq*1+32*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w16:
.v_w64:
.v_w128:
    ; Column-strip path: process the block as 16-pixel (32-byte) wide strips.
    ; r4 / r7 remember the top of the current strip in src / dst.
    ; r6d = h + (2*w)*8 - 256: the low byte holds h (restored with movzx
    ; after each strip) and bits 8 and up hold (number of strips - 1).
    ; NOTE(review): this packing assumes h fits in 8 bits -- confirm.
    movifnidn           t0d, org_w
    add                 t0d, t0d
    mov                  r4, srcq
    lea                 r6d, [hq+t0*8-256]
    mov                  r7, dstq
.v_w16_loop0:
    movu                 m0, [srcq+ssq*0]
.v_w16_loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m1, m3, m0
    pmulhrsw             m1, m5
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    psubw                m2, m0, m3
    pmulhrsw             m2, m5
    paddw                m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    ; Advance to the next 32-byte strip and reload h from the low byte.
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
Worker jg .v_w16_loop0 564*c0909341SAndroid Build Coastguard Worker RET 565*c0909341SAndroid Build Coastguard Worker.hv: 566*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] 567*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 568*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 569*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [pw_2] 570*c0909341SAndroid Build Coastguard Worker movd xm6, mxyd 571*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 572*c0909341SAndroid Build Coastguard Worker add wq, r7 573*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 574*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 575*c0909341SAndroid Build Coastguard Worker jnz .hv_12bpc 576*c0909341SAndroid Build Coastguard Worker psllw m4, 2 577*c0909341SAndroid Build Coastguard Worker psllw m5, 2 578*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_2048] 579*c0909341SAndroid Build Coastguard Worker.hv_12bpc: 580*c0909341SAndroid Build Coastguard Worker jmp wq 581*c0909341SAndroid Build Coastguard Worker.hv_w2: 582*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm1, [srcq+ssq*0] 583*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, xm1 584*c0909341SAndroid Build Coastguard Worker psrlq xm1, 16 585*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5 586*c0909341SAndroid Build Coastguard Worker paddw xm0, xm3 587*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 588*c0909341SAndroid Build Coastguard Worker psrlw xm0, 2 589*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 590*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+ssq*1] 591*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 592*c0909341SAndroid Build Coastguard Worker movhps xm2, [srcq+ssq*0] 593*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm4, xm2 594*c0909341SAndroid Build Coastguard Worker psrlq xm2, 16 595*c0909341SAndroid Build 
Coastguard Worker pmullw xm2, xm5 596*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 597*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 598*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 ; 1 _ 2 _ 599*c0909341SAndroid Build Coastguard Worker shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ 600*c0909341SAndroid Build Coastguard Worker mova xm0, xm1 601*c0909341SAndroid Build Coastguard Worker psubw xm1, xm2 602*c0909341SAndroid Build Coastguard Worker paddw xm1, xm1 603*c0909341SAndroid Build Coastguard Worker pmulhw xm1, xm6 604*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 605*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm7 606*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm1 607*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm1, 2 608*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 609*c0909341SAndroid Build Coastguard Worker sub hd, 2 610*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 611*c0909341SAndroid Build Coastguard Worker RET 612*c0909341SAndroid Build Coastguard Worker.hv_w4: 613*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, [srcq+ssq*0-8] 614*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5, [srcq+ssq*0-6] 615*c0909341SAndroid Build Coastguard Worker paddw xm0, xm3 616*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 617*c0909341SAndroid Build Coastguard Worker psrlw xm0, 2 618*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 619*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*1] 620*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+ssq*1+2] 621*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 622*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+ssq*0] 623*c0909341SAndroid Build Coastguard Worker movhps xm2, [srcq+ssq*0+2] 624*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm4 625*c0909341SAndroid Build Coastguard Worker pmullw xm2, xm5 626*c0909341SAndroid Build Coastguard Worker 
paddw xm1, xm3 627*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 628*c0909341SAndroid Build Coastguard Worker psrlw xm1, 2 ; 1 2 629*c0909341SAndroid Build Coastguard Worker shufpd xm2, xm0, xm1, 0x01 ; 0 1 630*c0909341SAndroid Build Coastguard Worker mova xm0, xm1 631*c0909341SAndroid Build Coastguard Worker psubw xm1, xm2 632*c0909341SAndroid Build Coastguard Worker paddw xm1, xm1 633*c0909341SAndroid Build Coastguard Worker pmulhw xm1, xm6 634*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 635*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm7 636*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm1 637*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm1 638*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 639*c0909341SAndroid Build Coastguard Worker sub hd, 2 640*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 641*c0909341SAndroid Build Coastguard Worker RET 642*c0909341SAndroid Build Coastguard Worker.hv_w8: 643*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, [srcq+ssq*0] 644*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5, [srcq+ssq*0+2] 645*c0909341SAndroid Build Coastguard Worker paddw xm0, xm3 646*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 647*c0909341SAndroid Build Coastguard Worker psrlw xm0, 2 648*c0909341SAndroid Build Coastguard Worker vinserti128 m0, xm0, 1 649*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 650*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 651*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*1+2] 652*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 653*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*0], 1 654*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*0+2], 1 655*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 656*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 657*c0909341SAndroid Build Coastguard Worker paddw m1, m3 
658*c0909341SAndroid Build Coastguard Worker paddw m1, m2 659*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 ; 1 2 660*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m0, m1, 0x21 ; 0 1 661*c0909341SAndroid Build Coastguard Worker mova m0, m1 662*c0909341SAndroid Build Coastguard Worker psubw m1, m2 663*c0909341SAndroid Build Coastguard Worker paddw m1, m1 664*c0909341SAndroid Build Coastguard Worker pmulhw m1, m6 665*c0909341SAndroid Build Coastguard Worker paddw m1, m2 666*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 667*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm1 668*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m1, 1 669*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 670*c0909341SAndroid Build Coastguard Worker sub hd, 2 671*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 672*c0909341SAndroid Build Coastguard Worker RET 673*c0909341SAndroid Build Coastguard Worker.hv_w16: 674*c0909341SAndroid Build Coastguard Worker.hv_w32: 675*c0909341SAndroid Build Coastguard Worker.hv_w64: 676*c0909341SAndroid Build Coastguard Worker.hv_w128: 677*c0909341SAndroid Build Coastguard Worker%if UNIX64 678*c0909341SAndroid Build Coastguard Worker lea r6d, [r8*2-32] 679*c0909341SAndroid Build Coastguard Worker%else 680*c0909341SAndroid Build Coastguard Worker mov r6d, wm 681*c0909341SAndroid Build Coastguard Worker lea r6d, [r6*2-32] 682*c0909341SAndroid Build Coastguard Worker%endif 683*c0909341SAndroid Build Coastguard Worker mov r4, srcq 684*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*8] 685*c0909341SAndroid Build Coastguard Worker mov r7, dstq 686*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 687*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ssq*0] 688*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, [srcq+ssq*0+2] 689*c0909341SAndroid Build Coastguard Worker paddw m0, m3 690*c0909341SAndroid Build Coastguard Worker paddw m0, m1 
691*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 692*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 693*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+ssq*1] 694*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+ssq*1+2] 695*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 696*c0909341SAndroid Build Coastguard Worker paddw m1, m3 697*c0909341SAndroid Build Coastguard Worker paddw m1, m2 698*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 699*c0909341SAndroid Build Coastguard Worker psubw m2, m1, m0 700*c0909341SAndroid Build Coastguard Worker paddw m2, m2 701*c0909341SAndroid Build Coastguard Worker pmulhw m2, m6 702*c0909341SAndroid Build Coastguard Worker paddw m2, m0 703*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m7 704*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m2 705*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ssq*0] 706*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+ssq*0+2] 707*c0909341SAndroid Build Coastguard Worker paddw m0, m3 708*c0909341SAndroid Build Coastguard Worker paddw m0, m2 709*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 710*c0909341SAndroid Build Coastguard Worker psubw m2, m0, m1 711*c0909341SAndroid Build Coastguard Worker paddw m2, m2 712*c0909341SAndroid Build Coastguard Worker pmulhw m2, m6 713*c0909341SAndroid Build Coastguard Worker paddw m2, m1 714*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m7 715*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m2 716*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 717*c0909341SAndroid Build Coastguard Worker sub hd, 2 718*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 719*c0909341SAndroid Build Coastguard Worker add r4, 32 720*c0909341SAndroid Build Coastguard Worker add r7, 32 721*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 722*c0909341SAndroid Build Coastguard Worker mov srcq, r4 723*c0909341SAndroid Build 
Coastguard Worker mov dstq, r7 724*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 725*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop0 726*c0909341SAndroid Build Coastguard Worker RET 727*c0909341SAndroid Build Coastguard Worker 728*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 729*c0909341SAndroid Build Coastguard Worker movifnidn mxyd, r5m ; mx 730*c0909341SAndroid Build Coastguard Worker lea r6, [prep_avx2] 731*c0909341SAndroid Build Coastguard Worker%if UNIX64 732*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 7 733*c0909341SAndroid Build Coastguard Worker %define org_w r7d 734*c0909341SAndroid Build Coastguard Worker%else 735*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 6 736*c0909341SAndroid Build Coastguard Worker %define org_w r5m 737*c0909341SAndroid Build Coastguard Worker%endif 738*c0909341SAndroid Build Coastguard Worker mov org_w, wd 739*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 740*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 741*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 742*c0909341SAndroid Build Coastguard Worker jnz .h 743*c0909341SAndroid Build Coastguard Worker mov mxyd, r6m ; my 744*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 745*c0909341SAndroid Build Coastguard Worker jnz .v 746*c0909341SAndroid Build Coastguard Worker.prep: 747*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep,)] 748*c0909341SAndroid Build Coastguard Worker mov r5d, r7m ; bitdepth_max 749*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [r6-prep_avx2+pw_8192] 750*c0909341SAndroid Build Coastguard Worker add wq, r6 751*c0909341SAndroid Build Coastguard Worker shr r5d, 11 752*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] 753*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 754*c0909341SAndroid Build Coastguard Worker jmp wq 
; ---------------------------------------------------------------------------
; prep_bilin_16bpc: unfiltered path (mx == 0 and my == 0), widths 4 and 8.
; Reached through the `jmp wq` dispatch at the end of .prep, which leaves:
;   m4       = dword broadcast loaded from prep_mul + (bitdepth_max >> 11)*4
;   m5       = dword broadcast loaded from pw_8192
;   stride3q = strideq*3
; Every output word is src*m4 - m5 in wrapping 16-bit arithmetic. Rows are
; written back-to-back to tmpq; four source rows are consumed per iteration.
; NOTE(review): the values stored in prep_mul / pw_8192 live outside this
; view; "8192" is assumed from the constant's name -- confirm in SECTION_RODATA.
; ---------------------------------------------------------------------------
.prep_w4:
    movq            xm0, [srcq+strideq*0]       ; row 0 (8 bytes)  -> qword 0
    movhps          xm0, [srcq+strideq*1]       ; row 1            -> qword 1
    vpbroadcastq    m1, [srcq+strideq*2]        ; row 2 replicated in all qwords
    vpbroadcastq    m2, [srcq+stride3q ]        ; row 3 replicated in all qwords
    lea             srcq, [srcq+strideq*4]
    vpblendd        m0, m1, 0x30                ; dwords 4-5 <- row 2
    vpblendd        m0, m2, 0xc0                ; dwords 6-7 <- row 3
    pmullw          m0, m4
    psubw           m0, m5
    mova            [tmpq], m0                  ; rows 0 1 2 3, 8 bytes each
    add             tmpq, 32
    sub             hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movu            xm0, [srcq+strideq*0]       ; row 0 -> low lane
    vinserti128     m0, [srcq+strideq*1], 1     ; row 1 -> high lane
    movu            xm1, [srcq+strideq*2]       ; row 2 -> low lane
    vinserti128     m1, [srcq+stride3q ], 1     ; row 3 -> high lane
    lea             srcq, [srcq+strideq*4]
    pmullw          m0, m4
    pmullw          m1, m4
    psubw           m0, m5
    psubw           m1, m5
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    add             tmpq, 32*2
    sub             hd, 4
    jg .prep_w8
    RET
; ---------------------------------------------------------------------------
; prep_bilin_16bpc: unfiltered path (mx == 0 and my == 0), widths 16..128.
; Same register contract as the narrower .prep_w* cases (set up in .prep):
;   m4 = multiplier broadcast from prep_mul, m5 = broadcast of pw_8192,
;   stride3q = strideq*3.
; Every output word is src*m4 - m5 (wrapping 16-bit); rows are written
; back-to-back to tmpq.
; ---------------------------------------------------------------------------
.prep_w16:                                      ; 4 rows x 32 bytes per iteration
    pmullw          m0, m4, [srcq+strideq*0]
    pmullw          m1, m4, [srcq+strideq*1]
    pmullw          m2, m4, [srcq+strideq*2]
    pmullw          m3, m4, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    psubw           m0, m5
    psubw           m1, m5
    psubw           m2, m5
    psubw           m3, m5
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    mova            [tmpq+32*2], m2
    mova            [tmpq+32*3], m3
    add             tmpq, 32*4
    sub             hd, 4
    jg .prep_w16
    RET
.prep_w32:                                      ; 2 rows x 64 bytes per iteration
    pmullw          m0, m4, [srcq+strideq*0+32*0]
    pmullw          m1, m4, [srcq+strideq*0+32*1]
    pmullw          m2, m4, [srcq+strideq*1+32*0]
    pmullw          m3, m4, [srcq+strideq*1+32*1]
    lea             srcq, [srcq+strideq*2]
    psubw           m0, m5
    psubw           m1, m5
    psubw           m2, m5
    psubw           m3, m5
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    mova            [tmpq+32*2], m2
    mova            [tmpq+32*3], m3
    add             tmpq, 32*4
    sub             hd, 2
    jg .prep_w32
    RET
.prep_w64:                                      ; 1 row x 128 bytes per iteration
    pmullw          m0, m4, [srcq+32*0]
    pmullw          m1, m4, [srcq+32*1]
    pmullw          m2, m4, [srcq+32*2]
    pmullw          m3, m4, [srcq+32*3]
    add             srcq, strideq
    psubw           m0, m5
    psubw           m1, m5
    psubw           m2, m5
    psubw           m3, m5
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    mova            [tmpq+32*2], m2
    mova            [tmpq+32*3], m3
    add             tmpq, 32*4
    dec             hd
    jg .prep_w64
    RET
.prep_w128:                                     ; 1 row x 256 bytes, in two halves
    pmullw          m0, m4, [srcq+32*0]
    pmullw          m1, m4, [srcq+32*1]
    pmullw          m2, m4, [srcq+32*2]
    pmullw          m3, m4, [srcq+32*3]
    psubw           m0, m5
    psubw           m1, m5
    psubw           m2, m5
    psubw           m3, m5
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    mova            [tmpq+32*2], m2
    mova            [tmpq+32*3], m3
    pmullw          m0, m4, [srcq+32*4]
    pmullw          m1, m4, [srcq+32*5]
    pmullw          m2, m4, [srcq+32*6]
    pmullw          m3, m4, [srcq+32*7]
    add             tmpq, 32*8                  ; advanced early: the stores below
    add             srcq, strideq               ; use negative offsets for 32*4..32*7
    psubw           m0, m5
    psubw           m1, m5
    psubw           m2, m5
    psubw           m3, m5
    mova            [tmpq-32*4], m0
    mova            [tmpq-32*3], m1
    mova            [tmpq-32*2], m2
    mova            [tmpq-32*1], m3
    dec             hd
    jg .prep_w128
    RET
; ---------------------------------------------------------------------------
; Horizontal filter setup (entered when mx != 0; mxyd still holds mx).
;   m5 = mx       (word broadcast)
;   m4 = 16 - mx  (pw_16 minus m5)
;   m3 = broadcast of pw_32766, subtracted from every sum before the shift
; mxyd is reloaded with my; a non-zero my diverts to .hv.
; When bit 0x800 of bitdepth_max (r7m) is clear -- the non-12bpc case, going
; by the label name -- both weights are pre-scaled by 4.
; Per output word: (src[x]*m4 + src[x+1]*m5 - m3) >> 2, arithmetic shift,
; wrapping 16-bit arithmetic.
; NOTE(review): -32766 == 2 - 32768, so this presumably folds a rounding
; term of 2 with a bias of 8192<<2; confirm against the C reference.
; ---------------------------------------------------------------------------
.h:
    movd            xm5, mxyd
    mov             mxyd, r6m ; my
    vpbroadcastd    m4, [pw_16]
    vpbroadcastw    m5, xm5
    vpbroadcastd    m3, [pw_32766]
    psubw           m4, m5
    test            dword r7m, 0x800
    jnz .h_12bpc
    psllw           m4, 2
    psllw           m5, 2
.h_12bpc:
    test            mxyd, mxyd
    jnz .hv
    movzx           wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add             wq, r6                      ; r6 = table base (prep_avx2)
    lea             stride3q, [strideq*3]
    jmp             wq
.h_w4:                                          ; 4 rows per iteration
    movu            xm1, [srcq+strideq*0]       ; row 0 -> low lane
    vinserti128     m1, [srcq+strideq*2], 1     ; row 2 -> high lane
    movu            xm2, [srcq+strideq*1]       ; row 1 -> low lane
    vinserti128     m2, [srcq+stride3q ], 1     ; row 3 -> high lane
    lea             srcq, [srcq+strideq*4]
    punpcklqdq      m0, m1, m2                  ; px 0-3 of rows 0 1 | 2 3
    psrldq          m1, 2                       ; rows 0/2: px 1-4 into low qword
    pslldq          m2, 6                       ; rows 1/3: px 1-4 into high qword
    pmullw          m0, m4
    vpblendd        m1, m2, 0xcc                ; px 1-4 of rows 0 1 | 2 3
    pmullw          m1, m5
    psubw           m0, m3
    paddw           m0, m1
    psraw           m0, 2
    mova            [tmpq], m0
    add             tmpq, 32
    sub             hd, 4
    jg .h_w4
    RET
.h_w8:                                          ; 2 rows per iteration
    movu            xm0, [srcq+strideq*0]
    vinserti128     m0, [srcq+strideq*1], 1
    movu            xm1, [srcq+strideq*0+2]     ; same rows, one pixel to the right
    vinserti128     m1, [srcq+strideq*1+2], 1
    lea             srcq, [srcq+strideq*2]
    pmullw          m0, m4
    pmullw          m1, m5
    psubw           m0, m3
    paddw           m0, m1
    psraw           m0, 2
    mova            [tmpq], m0
    add             tmpq, 32
    sub             hd, 2
    jg .h_w8
    RET
.h_w16:                                         ; 2 rows per iteration
    pmullw          m0, m4, [srcq+strideq*0]
    pmullw          m1, m5, [srcq+strideq*0+2]
    psubw           m0, m3
    paddw           m0, m1
    pmullw          m1, m4, [srcq+strideq*1]
    pmullw          m2, m5, [srcq+strideq*1+2]
    lea             srcq, [srcq+strideq*2]
    psubw           m1, m3
    paddw           m1, m2
    psraw           m0, 2
    psraw           m1, 2
    mova            [tmpq+32*0], m0
    mova            [tmpq+32*1], m1
    add             tmpq, 32*2
    sub             hd, 2
    jg .h_w16
    RET
; w >= 32: one row per outer iteration, walked from its right end towards the
; left in 32-pixel (64-byte) steps. t0d = org_w, the width saved at function
; entry before wd was overwritten with its tzcnt.
.h_w32:
.h_w64:
.h_w128:
    movifnidn       t0d, org_w
.h_w32_loop0:
    mov             r3d, t0d                    ; r3 = pixels left in this row
.h_w32_loop:
    pmullw          m0, m4, [srcq+r3*2-32*1]
    pmullw          m1, m5, [srcq+r3*2-32*1+2]
    psubw           m0, m3
    paddw           m0, m1
    pmullw          m1, m4, [srcq+r3*2-32*2]
    pmullw          m2, m5, [srcq+r3*2-32*2+2]
    psubw           m1, m3
    paddw           m1, m2
    psraw           m0, 2
    psraw           m1, 2
    mova            [tmpq+r3*2-32*1], m0
    mova            [tmpq+r3*2-32*2], m1
    sub             r3d, 32
    jg .h_w32_loop
    add             srcq, strideq
    lea             tmpq, [tmpq+t0*2]           ; next output row: w*2 bytes on
    dec             hd
    jg .h_w32_loop0
    RET
; ---------------------------------------------------------------------------
; Vertical filter setup (entered when mx == 0 and my != 0; mxyd holds my).
; Mirrors the .h setup: m5 = my, m4 = 16 - my, m3 = broadcast of pw_32766,
; with both weights pre-scaled by 4 when bit 0x800 of bitdepth_max is clear.
; Per output word: (row[y]*m4 + row[y+1]*m5 - m3) >> 2, arithmetic shift.
; ---------------------------------------------------------------------------
.v:
    movzx           wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    movd            xm5, mxyd
    vpbroadcastd    m4, [pw_16]
    vpbroadcastw    m5, xm5
    vpbroadcastd    m3, [pw_32766]
    add             wq, r6
    lea             stride3q, [strideq*3]
    psubw           m4, m5
    test            dword r7m, 0x800
    jnz .v_12bpc
    psllw           m4, 2
    psllw           m5, 2
.v_12bpc:
    jmp             wq
; w == 4: four output rows per iteration. The low qword of m0 carries the last
; row loaded by the previous iteration (row 0 of the current one). The digits
; in the trailing comments give the source row held in each qword.
.v_w4:
    movq            xm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq    m2, [srcq+strideq*2]
    vpbroadcastq    xm1, [srcq+strideq*1]
    vpblendd        m2, m0, 0x03                ; 0 2 2 2
    vpbroadcastq    m0, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    vpblendd        m1, m0, 0xf0                ; 1 1 3 3
    vpbroadcastq    m0, [srcq+strideq*0]
    vpblendd        m1, m2, 0x33                ; 0 1 2 3
    vpblendd        m0, m2, 0x0c                ; 4 2 4 4
    punpckhqdq      m2, m1, m0                  ; 1 2 3 4
    pmullw          m1, m4
    pmullw          m2, m5
    psubw           m1, m3
    paddw           m1, m2
    psraw           m1, 2
    mova            [tmpq], m1
    add             tmpq, 32
    sub             hd, 4
    jg .v_w4_loop
    RET
1006*c0909341SAndroid Build Coastguard Worker.v_w8: 1007*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 1008*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1009*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [srcq+strideq*1] 1010*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1011*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, m2, 0xf0 ; 0 1 1012*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+strideq*0] 1013*c0909341SAndroid Build Coastguard Worker vpblendd m2, m0, 0xf0 ; 1 2 1014*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 1015*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 1016*c0909341SAndroid Build Coastguard Worker psubw m1, m3 1017*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1018*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1019*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1020*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1021*c0909341SAndroid Build Coastguard Worker sub hd, 2 1022*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1023*c0909341SAndroid Build Coastguard Worker RET 1024*c0909341SAndroid Build Coastguard Worker.v_w16: 1025*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1026*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1027*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+strideq*1] 1028*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1029*c0909341SAndroid Build Coastguard Worker pmullw m0, m4 1030*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, m2 1031*c0909341SAndroid Build Coastguard Worker psubw m0, m3 1032*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1033*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1034*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1035*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 1036*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m1 
1037*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, m0 1038*c0909341SAndroid Build Coastguard Worker psubw m2, m3 1039*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1040*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1041*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 1042*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 1043*c0909341SAndroid Build Coastguard Worker sub hd, 2 1044*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1045*c0909341SAndroid Build Coastguard Worker RET 1046*c0909341SAndroid Build Coastguard Worker.v_w32: 1047*c0909341SAndroid Build Coastguard Worker.v_w64: 1048*c0909341SAndroid Build Coastguard Worker.v_w128: 1049*c0909341SAndroid Build Coastguard Worker%if WIN64 1050*c0909341SAndroid Build Coastguard Worker PUSH r7 1051*c0909341SAndroid Build Coastguard Worker%endif 1052*c0909341SAndroid Build Coastguard Worker movifnidn r7d, org_w 1053*c0909341SAndroid Build Coastguard Worker add r7d, r7d 1054*c0909341SAndroid Build Coastguard Worker mov r3, srcq 1055*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r7*8-256] 1056*c0909341SAndroid Build Coastguard Worker mov r5, tmpq 1057*c0909341SAndroid Build Coastguard Worker.v_w32_loop0: 1058*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1059*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1060*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+strideq*1] 1061*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1062*c0909341SAndroid Build Coastguard Worker pmullw m0, m4 1063*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, m2 1064*c0909341SAndroid Build Coastguard Worker psubw m0, m3 1065*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1066*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1067*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1068*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 1069*c0909341SAndroid Build Coastguard Worker mova [tmpq+r7*0], 
m1 1070*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, m0 1071*c0909341SAndroid Build Coastguard Worker psubw m2, m3 1072*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1073*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1074*c0909341SAndroid Build Coastguard Worker mova [tmpq+r7*1], m1 1075*c0909341SAndroid Build Coastguard Worker lea tmpq, [tmpq+r7*2] 1076*c0909341SAndroid Build Coastguard Worker sub hd, 2 1077*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1078*c0909341SAndroid Build Coastguard Worker add r3, 32 1079*c0909341SAndroid Build Coastguard Worker add r5, 32 1080*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 1081*c0909341SAndroid Build Coastguard Worker mov srcq, r3 1082*c0909341SAndroid Build Coastguard Worker mov tmpq, r5 1083*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 1084*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop0 1085*c0909341SAndroid Build Coastguard Worker%if WIN64 1086*c0909341SAndroid Build Coastguard Worker POP r7 1087*c0909341SAndroid Build Coastguard Worker%endif 1088*c0909341SAndroid Build Coastguard Worker RET 1089*c0909341SAndroid Build Coastguard Worker.hv: 1090*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 1091*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1092*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 1093*c0909341SAndroid Build Coastguard Worker movd xm6, mxyd 1094*c0909341SAndroid Build Coastguard Worker add wq, r6 1095*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1096*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 1097*c0909341SAndroid Build Coastguard Worker jmp wq 1098*c0909341SAndroid Build Coastguard Worker.hv_w4: 1099*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0] 1100*c0909341SAndroid Build Coastguard Worker%if WIN64 1101*c0909341SAndroid Build Coastguard Worker movaps [rsp+24], xmm7 1102*c0909341SAndroid Build Coastguard 
Worker%endif 1103*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, xm1 1104*c0909341SAndroid Build Coastguard Worker psrldq xm1, 2 1105*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5 1106*c0909341SAndroid Build Coastguard Worker psubw xm0, xm3 1107*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 1108*c0909341SAndroid Build Coastguard Worker psraw xm0, 2 1109*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, xm0 1110*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1111*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1] 1112*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+stride3q ], 1 1113*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*2] 1114*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1115*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*0], 1 1116*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m1, m2 1117*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 1118*c0909341SAndroid Build Coastguard Worker pslldq m2, 6 1119*c0909341SAndroid Build Coastguard Worker pmullw m7, m4 1120*c0909341SAndroid Build Coastguard Worker vpblendd m1, m2, 0xcc 1121*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 1122*c0909341SAndroid Build Coastguard Worker psubw m7, m3 1123*c0909341SAndroid Build Coastguard Worker paddw m1, m7 1124*c0909341SAndroid Build Coastguard Worker psraw m1, 2 ; 1 2 3 4 1125*c0909341SAndroid Build Coastguard Worker vpblendd m0, m1, 0x3f 1126*c0909341SAndroid Build Coastguard Worker vpermq m2, m0, q2103 ; 0 1 2 3 1127*c0909341SAndroid Build Coastguard Worker mova m0, m1 1128*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1129*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1130*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1131*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1132*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1133*c0909341SAndroid Build Coastguard 
Worker sub hd, 4 1134*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1135*c0909341SAndroid Build Coastguard Worker%if WIN64 1136*c0909341SAndroid Build Coastguard Worker movaps xmm7, [rsp+24] 1137*c0909341SAndroid Build Coastguard Worker%endif 1138*c0909341SAndroid Build Coastguard Worker RET 1139*c0909341SAndroid Build Coastguard Worker.hv_w8: 1140*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, [srcq+strideq*0] 1141*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5, [srcq+strideq*0+2] 1142*c0909341SAndroid Build Coastguard Worker psubw xm0, xm3 1143*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 1144*c0909341SAndroid Build Coastguard Worker psraw xm0, 2 1145*c0909341SAndroid Build Coastguard Worker vinserti128 m0, xm0, 1 1146*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 1147*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1] 1148*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*1+2] 1149*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1150*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*0], 1 1151*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*0+2], 1 1152*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 1153*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 1154*c0909341SAndroid Build Coastguard Worker psubw m1, m3 1155*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1156*c0909341SAndroid Build Coastguard Worker psraw m1, 2 ; 1 2 1157*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m0, m1, 0x21 ; 0 1 1158*c0909341SAndroid Build Coastguard Worker mova m0, m1 1159*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1160*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1161*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1162*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1163*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1164*c0909341SAndroid Build Coastguard Worker 
sub hd, 2 1165*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 1166*c0909341SAndroid Build Coastguard Worker RET 1167*c0909341SAndroid Build Coastguard Worker.hv_w16: 1168*c0909341SAndroid Build Coastguard Worker.hv_w32: 1169*c0909341SAndroid Build Coastguard Worker.hv_w64: 1170*c0909341SAndroid Build Coastguard Worker.hv_w128: 1171*c0909341SAndroid Build Coastguard Worker%if WIN64 1172*c0909341SAndroid Build Coastguard Worker PUSH r7 1173*c0909341SAndroid Build Coastguard Worker%endif 1174*c0909341SAndroid Build Coastguard Worker movifnidn r7d, org_w 1175*c0909341SAndroid Build Coastguard Worker add r7d, r7d 1176*c0909341SAndroid Build Coastguard Worker mov r3, srcq 1177*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r7*8-256] 1178*c0909341SAndroid Build Coastguard Worker mov r5, tmpq 1179*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 1180*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq] 1181*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, [srcq+2] 1182*c0909341SAndroid Build Coastguard Worker psubw m0, m3 1183*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1184*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1185*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 1186*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+strideq*1] 1187*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+strideq*1+2] 1188*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1189*c0909341SAndroid Build Coastguard Worker psubw m1, m3 1190*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1191*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1192*c0909341SAndroid Build Coastguard Worker psubw m2, m1, m0 1193*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 1194*c0909341SAndroid Build Coastguard Worker paddw m2, m0 1195*c0909341SAndroid Build Coastguard Worker mova [tmpq+r7*0], m2 1196*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+strideq*0] 
; (continuation of the .hv_w16_loop body that starts above: second output row)
    pmullw               m2, m5, [srcq+strideq*0+2]
    psubw                m0, m3
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+r7*1], m2
    lea                tmpq, [tmpq+r7*2]
    sub                  hd, 2
    jg .hv_w16_loop
    ; advance to the next 32-byte column; r6d holds the saved row count in
    ; its low byte and the remaining column counter in the bits above it
    add                  r3, 32
    add                  r5, 32
    movzx                hd, r6b
    mov                srcq, r3
    mov                tmpq, r5
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    POP                  r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
;
; Each FILTER_* constant packs two multiples of 15 into one dword: one in
; bits 16 and up, one in the low bits. put_6tap_16bpc adds the constant to
; mx*0x010101 (resp. my*0x010101), giving the three byte lanes its comments
; label "6tap, mx/my, 4tap".
; NOTE(review): the multiples of 15 look like row offsets into
; subpel_filters[5][15][8] (15 entries per filter set) - confirm against
; the table definition.
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN prefix, type, type_h, type_v [, jmp_to]
;
; Declares the entry point <prefix>_<type>_16bpc. It loads the horizontal
; filter selector FILTER_<type_h> into t0d and the vertical selector
; FILTER_<type_v> into t1d. If a fifth argument is given, it then jumps to
; the shared implementation <jmp_to>.
;
; With only four arguments no jump is emitted, so the stub falls through
; into whatever code follows the macro invocation. The last stub of a group
; must therefore be placed directly in front of the shared implementation.
;
; Clobbers: t0d, t1d (as selected by DECLARE_REG_TMP below).
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d ; same selector for both directions
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; t0/t1 carry the filter selectors from the FN stubs into the shared
; implementation; the register indices differ between WIN64 and other ABIs.
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

; put_8tap entry points that are served by the 6-tap implementation.
; "regular" is listed last without a jump target so that it falls straight
; through into put_6tap_16bpc below.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r8, [put_avx2]
1257*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1258*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1259*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 1260*c0909341SAndroid Build Coastguard Worker jnz .h 1261*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1262*c0909341SAndroid Build Coastguard Worker jnz .v 1263*c0909341SAndroid Build Coastguard Worker.put: 1264*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 1265*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put,)] 1266*c0909341SAndroid Build Coastguard Worker add wq, r8 1267*c0909341SAndroid Build Coastguard Worker%if WIN64 1268*c0909341SAndroid Build Coastguard Worker pop r8 1269*c0909341SAndroid Build Coastguard Worker%endif 1270*c0909341SAndroid Build Coastguard Worker jmp wq 1271*c0909341SAndroid Build Coastguard Worker.h_w2: 1272*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1273*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1274*c0909341SAndroid Build Coastguard Worker mova xm2, [subpel_h_shuf2] 1275*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] 1276*c0909341SAndroid Build Coastguard Worker pmovsxbw xm3, xm3 1277*c0909341SAndroid Build Coastguard Worker.h_w2_loop: 1278*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 1279*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 1280*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1281*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm2 1282*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm2 1283*c0909341SAndroid Build Coastguard Worker pmaddwd xm0, xm3 1284*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm3 1285*c0909341SAndroid Build Coastguard Worker phaddd xm0, xm1 1286*c0909341SAndroid Build Coastguard Worker paddd xm0, xm4 1287*c0909341SAndroid Build Coastguard Worker psrad xm0, 6 1288*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm0 
1289*c0909341SAndroid Build Coastguard Worker pminsw xm0, xm5 1290*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm0 1291*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm0, 1 1292*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1293*c0909341SAndroid Build Coastguard Worker sub hd, 2 1294*c0909341SAndroid Build Coastguard Worker jg .h_w2_loop 1295*c0909341SAndroid Build Coastguard Worker RET 1296*c0909341SAndroid Build Coastguard Worker.h_w4: 1297*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1298*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1299*c0909341SAndroid Build Coastguard Worker pmovsxbw xm3, [base+subpel_filters+mxq*8] 1300*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 1301*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufA] 1302*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufB] 1303*c0909341SAndroid Build Coastguard Worker pshufd xm3, xm3, q2211 1304*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, xm3 1305*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, q1111 1306*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 1307*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0] 1308*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1], 1 1309*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1310*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 1311*c0909341SAndroid Build Coastguard Worker pshufb m1, m7 ; 2 3 3 4 4 5 5 6 1312*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m2 1313*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m3 1314*c0909341SAndroid Build Coastguard Worker paddd m0, m4 1315*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1316*c0909341SAndroid Build Coastguard Worker psrad m0, 6 1317*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1318*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm1 
1319*c0909341SAndroid Build Coastguard Worker pminsw xm0, xm5 1320*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 1321*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm0 1322*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1323*c0909341SAndroid Build Coastguard Worker sub hd, 2 1324*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 1325*c0909341SAndroid Build Coastguard Worker RET 1326*c0909341SAndroid Build Coastguard Worker.h: 1327*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1328*c0909341SAndroid Build Coastguard Worker jnz .hv 1329*c0909341SAndroid Build Coastguard Worker mov r7d, r8m 1330*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, r8m 1331*c0909341SAndroid Build Coastguard Worker shr r7d, 11 1332*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] 1333*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1334*c0909341SAndroid Build Coastguard Worker je .h_w4 1335*c0909341SAndroid Build Coastguard Worker jl .h_w2 1336*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 11 1337*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1338*c0909341SAndroid Build Coastguard Worker sub srcq, 4 1339*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+1+mxq*8] 1340*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [base+subpel_h_shufA] 1341*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 1342*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 1343*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q0000 1344*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q1111 1345*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q2222 1346*c0909341SAndroid Build Coastguard Worker sub wd, 16 1347*c0909341SAndroid Build Coastguard Worker jge .h_w16 1348*c0909341SAndroid Build Coastguard Worker.h_w8: 1349*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, 
tmp[1-2] 1350*c0909341SAndroid Build Coastguard Worker pshufb m%1, m6 ; 01 12 23 34 1351*c0909341SAndroid Build Coastguard Worker pshufb m%2, m6 ; 45 56 67 78 1352*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m7, m%1 ; a0 1353*c0909341SAndroid Build Coastguard Worker pshufb m%3, m6 ; 89 9a ab bc 1354*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%2 ; a2 1355*c0909341SAndroid Build Coastguard Worker shufpd m%1, m%2, 0x05 ; 23 34 45 56 1356*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 ; a0+a2 1357*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m7, m%2 ; b0 1358*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 67 78 89 9a 1359*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m9 ; b2 1360*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 ; a1 1361*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m8 ; b1 1362*c0909341SAndroid Build Coastguard Worker paddd m%3, m%5 ; b0+b2 1363*c0909341SAndroid Build Coastguard Worker paddd m%4, m4 1364*c0909341SAndroid Build Coastguard Worker paddd m%3, m4 1365*c0909341SAndroid Build Coastguard Worker paddd m%1, m%4 1366*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 1367*c0909341SAndroid Build Coastguard Worker psrad m%1, 6 1368*c0909341SAndroid Build Coastguard Worker psrad m%2, 6 1369*c0909341SAndroid Build Coastguard Worker packusdw m%1, m%2 1370*c0909341SAndroid Build Coastguard Worker pminsw m%1, m5 1371*c0909341SAndroid Build Coastguard Worker%endmacro 1372*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+ 0] 1373*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+ 0], 1 1374*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*0+16] 1375*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+16], 1 1376*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 1377*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1378*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 0, 
1, 2, 3, 10 1379*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 1380*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 1381*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1382*c0909341SAndroid Build Coastguard Worker sub hd, 2 1383*c0909341SAndroid Build Coastguard Worker jg .h_w8 1384*c0909341SAndroid Build Coastguard Worker RET 1385*c0909341SAndroid Build Coastguard Worker.h_w16: 1386*c0909341SAndroid Build Coastguard Worker mov r6d, wd 1387*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 1388*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6*2+ 0] 1389*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6*2+ 8] 1390*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+16] 1391*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 0, 1, 2, 3, 10 1392*c0909341SAndroid Build Coastguard Worker mova [dstq+r6*2], m0 1393*c0909341SAndroid Build Coastguard Worker sub r6d, 16 1394*c0909341SAndroid Build Coastguard Worker jge .h_w16_loop 1395*c0909341SAndroid Build Coastguard Worker add srcq, ssq 1396*c0909341SAndroid Build Coastguard Worker add dstq, dsq 1397*c0909341SAndroid Build Coastguard Worker dec hd 1398*c0909341SAndroid Build Coastguard Worker jg .h_w16 1399*c0909341SAndroid Build Coastguard Worker RET 1400*c0909341SAndroid Build Coastguard Worker.v: 1401*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1402*c0909341SAndroid Build Coastguard Worker shr myd, 16 1403*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1404*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1405*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+1+myq*8] 1406*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 10, 12 1407*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pd_32] 1408*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, r8m 1409*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 1410*c0909341SAndroid Build Coastguard 
Worker mov r6, ssq 1411*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 1412*c0909341SAndroid Build Coastguard Worker neg r6 1413*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q0000 1414*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q1111 1415*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q2222 1416*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1417*c0909341SAndroid Build Coastguard Worker jg .v_w8 1418*c0909341SAndroid Build Coastguard Worker je .v_w4 1419*c0909341SAndroid Build Coastguard Worker.v_w2: 1420*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+r6 *2] 1421*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+r6 *1], 1 1422*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*0], 2 1423*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 1424*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1425*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+ssq*0] 1426*c0909341SAndroid Build Coastguard Worker palignr xm3, xm0, xm2, 4 ; 1 2 3 4 1427*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2, xm3 ; 01 12 1428*c0909341SAndroid Build Coastguard Worker punpckhwd xm2, xm3 ; 23 34 1429*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 1430*c0909341SAndroid Build Coastguard Worker movd xm3, [srcq+ssq*1] 1431*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm7, xm1 ; a0 b0 1432*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1433*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm8 ; a1 b1 1434*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1435*c0909341SAndroid Build Coastguard Worker paddd xm4, xm2 1436*c0909341SAndroid Build Coastguard Worker punpckldq xm2, xm0, xm3 ; 4 5 1437*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+ssq*0] 1438*c0909341SAndroid Build Coastguard Worker punpckldq xm3, xm0 ; 5 6 1439*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm3 ; 45 56 
1440*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm9, xm2 ; a2 b2 1441*c0909341SAndroid Build Coastguard Worker paddd xm4, xm5 1442*c0909341SAndroid Build Coastguard Worker paddd xm4, xm3 1443*c0909341SAndroid Build Coastguard Worker psrad xm4, 6 1444*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm4 1445*c0909341SAndroid Build Coastguard Worker pminsw xm4, xm6 1446*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm4 1447*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm4, 1 1448*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1449*c0909341SAndroid Build Coastguard Worker sub hd, 2 1450*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 1451*c0909341SAndroid Build Coastguard Worker RET 1452*c0909341SAndroid Build Coastguard Worker.v_w4: 1453*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+r6 *2] 1454*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+r6 *1] 1455*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 1456*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 1457*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1458*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 1459*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0x30 1460*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0x30 1461*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; 01 12 1462*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0x30 1463*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 1464*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 ; 23 34 1465*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1466*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 1467*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7, m1 ; a0 b0 1468*c0909341SAndroid Build Coastguard Worker mova m1, m2 1469*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 ; a1 
b1 1470*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1471*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1472*c0909341SAndroid Build Coastguard Worker vpblendd m2, m0, m3, 0x30 1473*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 1474*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0x30 1475*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; 45 56 1476*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9, m2 ; a2 b2 1477*c0909341SAndroid Build Coastguard Worker paddd m4, m5 1478*c0909341SAndroid Build Coastguard Worker paddd m4, m3 1479*c0909341SAndroid Build Coastguard Worker psrad m4, 6 1480*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m4, 1 1481*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm3 1482*c0909341SAndroid Build Coastguard Worker pminsw xm4, xm6 1483*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm4 1484*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm4 1485*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1486*c0909341SAndroid Build Coastguard Worker sub hd, 2 1487*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1488*c0909341SAndroid Build Coastguard Worker RET 1489*c0909341SAndroid Build Coastguard Worker.v_w8: 1490*c0909341SAndroid Build Coastguard Worker shl wd, 5 1491*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 12 1492*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 1493*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: 1494*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [srcq+r6 *2] 1495*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+r6 *1] 1496*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*2] 1497*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 1498*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [srcq+ssq*1] 1499*c0909341SAndroid Build Coastguard Worker mov r8, dstq 1500*c0909341SAndroid Build 
Coastguard Worker vbroadcasti128 m2, [r7+ssq*0] 1501*c0909341SAndroid Build Coastguard Worker shufpd m3, m0, 0x0c 1502*c0909341SAndroid Build Coastguard Worker shufpd m4, m1, 0x0c 1503*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 ; 01 1504*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 ; 23 1505*c0909341SAndroid Build Coastguard Worker shufpd m0, m2, 0x0c 1506*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 ; 12 1507*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 34 1508*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1509*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [r7+ssq*1] 1510*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m7, m1 ; a0 1511*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 1512*c0909341SAndroid Build Coastguard Worker pmaddwd m11, m7, m2 ; b0 1513*c0909341SAndroid Build Coastguard Worker mova m1, m3 1514*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m8 ; a1 1515*c0909341SAndroid Build Coastguard Worker mova m2, m4 1516*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m8 ; b1 1517*c0909341SAndroid Build Coastguard Worker paddd m10, m3 1518*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r7+ssq*0] 1519*c0909341SAndroid Build Coastguard Worker paddd m11, m4 1520*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, m5, 0x0d 1521*c0909341SAndroid Build Coastguard Worker shufpd m0, m5, m3, 0x0c 1522*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m0 ; 45 1523*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 56 1524*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m9, m3 ; a2 1525*c0909341SAndroid Build Coastguard Worker paddd m10, m5 1526*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m9, m4 ; b2 1527*c0909341SAndroid Build Coastguard Worker paddd m5, m11 1528*c0909341SAndroid Build Coastguard Worker psrad m10, 5 1529*c0909341SAndroid Build Coastguard Worker psrad m5, 5 1530*c0909341SAndroid Build 
Coastguard Worker packusdw m10, m5 1531*c0909341SAndroid Build Coastguard Worker pxor m5, m5 1532*c0909341SAndroid Build Coastguard Worker pavgw m5, m10 1533*c0909341SAndroid Build Coastguard Worker pminsw m5, m6 1534*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 1535*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], xm5 1536*c0909341SAndroid Build Coastguard Worker vextracti128 [r8+dsq*1], m5, 1 1537*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 1538*c0909341SAndroid Build Coastguard Worker sub hd, 2 1539*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1540*c0909341SAndroid Build Coastguard Worker add srcq, 16 1541*c0909341SAndroid Build Coastguard Worker add dstq, 16 1542*c0909341SAndroid Build Coastguard Worker movzx hd, wb 1543*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 1544*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop0 1545*c0909341SAndroid Build Coastguard Worker RET 1546*c0909341SAndroid Build Coastguard Worker.hv: 1547*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 16 1548*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_512] 1549*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, r8m 1550*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1551*c0909341SAndroid Build Coastguard Worker jg .hv_w8 1552*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1553*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+subpel_filters+mxq*8+2] 1554*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1555*c0909341SAndroid Build Coastguard Worker shr myd, 16 1556*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1557*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1558*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [base+subpel_filters+1+myq*8] 1559*c0909341SAndroid Build Coastguard Worker mov r6, ssq 1560*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1561*c0909341SAndroid Build Coastguard Worker neg r6 1562*c0909341SAndroid 
Build Coastguard Worker pxor m6, m6 1563*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m0 1564*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 1565*c0909341SAndroid Build Coastguard Worker psraw m1, 8 ; sign-extend 1566*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 1567*c0909341SAndroid Build Coastguard Worker jz .hv_10bit 1568*c0909341SAndroid Build Coastguard Worker psraw m6, 2 1569*c0909341SAndroid Build Coastguard Worker psllw m1, 2 1570*c0909341SAndroid Build Coastguard Worker.hv_10bit: 1571*c0909341SAndroid Build Coastguard Worker pshufd m7, m1, q0000 1572*c0909341SAndroid Build Coastguard Worker pshufd m8, m1, q1111 1573*c0909341SAndroid Build Coastguard Worker pshufd m9, m1, q2222 1574*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1575*c0909341SAndroid Build Coastguard Worker je .hv_w4 1576*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [subpel_h_shuf2] 1577*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 1578*c0909341SAndroid Build Coastguard Worker vinserti128 m2, m0, [srcq+r6*2], 1 ; 2 0 1579*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 1580*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+r6 *1], 1 ; 3 1 1581*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1582*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*0], 0 ; 4 2 1583*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m5}, m2, m1, m0 1584*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m6}, m2, m1, m0 1585*c0909341SAndroid Build Coastguard Worker phaddd m2, m1 1586*c0909341SAndroid Build Coastguard Worker phaddd m1, m0 1587*c0909341SAndroid Build Coastguard Worker paddd m2, m10 1588*c0909341SAndroid Build Coastguard Worker paddd m1, m10 1589*c0909341SAndroid Build Coastguard Worker psrad m2, 10 1590*c0909341SAndroid Build Coastguard Worker psrad m1, 10 1591*c0909341SAndroid Build Coastguard Worker packssdw m2, m1 ; 2 3 3 4 0 1 1 2 
1592*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m2, m2 1593*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m0 ; 23 34 1594*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m2, 1 ; 01 12 1595*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 1596*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 1597*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1598*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*0] 1599*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm5 1600*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm5 1601*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm6 1602*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm6 1603*c0909341SAndroid Build Coastguard Worker phaddd xm3, xm4 1604*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm7, xm1 ; a0 b0 1605*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1606*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm8 ; a1 b1 1607*c0909341SAndroid Build Coastguard Worker paddd xm4, xm2 1608*c0909341SAndroid Build Coastguard Worker paddd xm3, xm10 1609*c0909341SAndroid Build Coastguard Worker psrad xm3, 10 1610*c0909341SAndroid Build Coastguard Worker packssdw xm3, xm3 1611*c0909341SAndroid Build Coastguard Worker palignr xm2, xm3, xm0, 12 1612*c0909341SAndroid Build Coastguard Worker mova xm0, xm3 1613*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm0 ; 45 56 1614*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm9, xm2 ; a2 b2 1615*c0909341SAndroid Build Coastguard Worker paddd xm4, xm10 1616*c0909341SAndroid Build Coastguard Worker paddd xm4, xm3 1617*c0909341SAndroid Build Coastguard Worker psrad xm4, 10 1618*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm4 1619*c0909341SAndroid Build Coastguard Worker pminsw xm4, xm11 1620*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm4 1621*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm4, 1 
1622*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1623*c0909341SAndroid Build Coastguard Worker sub hd, 2 1624*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 1625*c0909341SAndroid Build Coastguard Worker RET 1626*c0909341SAndroid Build Coastguard Worker.hv_w4: 1627*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 14 1628*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m12, [subpel_h_shufA] 1629*c0909341SAndroid Build Coastguard Worker pshufd m5, m6, q0000 1630*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m13, [subpel_h_shufB] 1631*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q1111 1632*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+r6 *2] 1633*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 1634*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 1635*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 1636*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1637*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*0] ; 4 1638*c0909341SAndroid Build Coastguard Worker pshufb m1, m2, m12 1639*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5 1640*c0909341SAndroid Build Coastguard Worker pshufb m2, m13 1641*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m6 1642*c0909341SAndroid Build Coastguard Worker pshufb m4, m0, m12 1643*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m5 1644*c0909341SAndroid Build Coastguard Worker pshufb m0, m13 1645*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 1646*c0909341SAndroid Build Coastguard Worker paddd m2, m1 1647*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm3, xm12 1648*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm5 1649*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm13 1650*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm6 1651*c0909341SAndroid Build Coastguard Worker paddd m0, m4 1652*c0909341SAndroid Build 
Coastguard Worker paddd m2, m10 1653*c0909341SAndroid Build Coastguard Worker paddd xm1, xm10 1654*c0909341SAndroid Build Coastguard Worker paddd m0, m10 1655*c0909341SAndroid Build Coastguard Worker paddd xm3, xm1 1656*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 10}, m2, m0, xm3 1657*c0909341SAndroid Build Coastguard Worker packssdw m2, m0 ; 0 2 1 3 1658*c0909341SAndroid Build Coastguard Worker packssdw xm0, xm3 ; 2 4 1659*c0909341SAndroid Build Coastguard Worker vperm2i128 m0, m2, 0x03 1660*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m0 ; 01 12 1661*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0 ; 23 34 1662*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1663*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 1664*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1665*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*0], 1 1666*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7, m1 ; a0 b0 1667*c0909341SAndroid Build Coastguard Worker mova m1, m2 1668*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 ; a1 b1 1669*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1670*c0909341SAndroid Build Coastguard Worker pshufb m2, m3, m12 1671*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5 1672*c0909341SAndroid Build Coastguard Worker pshufb m3, m13 1673*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 1674*c0909341SAndroid Build Coastguard Worker paddd m2, m10 1675*c0909341SAndroid Build Coastguard Worker paddd m3, m2 1676*c0909341SAndroid Build Coastguard Worker psrad m3, 10 1677*c0909341SAndroid Build Coastguard Worker packssdw m3, m3 ; 5 5 6 6 1678*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m0, m3, 0x21 1679*c0909341SAndroid Build Coastguard Worker mova m0, m3 1680*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 ; 45 56 1681*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9, m2 ; a2 b2 1682*c0909341SAndroid Build Coastguard 
Worker paddd m4, m10 1683*c0909341SAndroid Build Coastguard Worker paddd m4, m3 1684*c0909341SAndroid Build Coastguard Worker psrad m4, 10 1685*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m4, 1 1686*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm3 1687*c0909341SAndroid Build Coastguard Worker pminsw xm4, xm11 1688*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm4 1689*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm4 1690*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1691*c0909341SAndroid Build Coastguard Worker sub hd, 2 1692*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1693*c0909341SAndroid Build Coastguard Worker RET 1694*c0909341SAndroid Build Coastguard Worker.hv_w8: 1695*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 16, 12 1696*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1697*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m12, [subpel_h_shufA] 1698*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [base+subpel_filters+1+mxq*8] 1699*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1700*c0909341SAndroid Build Coastguard Worker shr myd, 16 1701*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1702*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1703*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [base+subpel_filters+1+myq*8] 1704*c0909341SAndroid Build Coastguard Worker shl wd, 5 1705*c0909341SAndroid Build Coastguard Worker mov r6, ssq 1706*c0909341SAndroid Build Coastguard Worker sub srcq, 4 1707*c0909341SAndroid Build Coastguard Worker pxor m0, m0 1708*c0909341SAndroid Build Coastguard Worker neg r6 1709*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 1710*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 1711*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 1712*c0909341SAndroid Build Coastguard Worker jz .hv_w8_10bit 1713*c0909341SAndroid Build Coastguard Worker psraw m0, 2 
1714*c0909341SAndroid Build Coastguard Worker psllw xm1, 2 1715*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit: 1716*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q0000 1717*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q1111 1718*c0909341SAndroid Build Coastguard Worker%if WIN64 1719*c0909341SAndroid Build Coastguard Worker %define v_mul (rsp+stack_offset+40) ; r4m 1720*c0909341SAndroid Build Coastguard Worker%else 1721*c0909341SAndroid Build Coastguard Worker %define v_mul (rsp+stack_offset+ 8) ; r6m 1722*c0909341SAndroid Build Coastguard Worker%endif 1723*c0909341SAndroid Build Coastguard Worker mova [v_mul], xm1 1724*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q2222 1725*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 1726*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0+ 0] 1727*c0909341SAndroid Build Coastguard Worker vinserti128 m3, m0, [srcq+r6*2+ 0], 0 1728*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*2] 1729*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [srcq+ssq*0+16] 1730*c0909341SAndroid Build Coastguard Worker vinserti128 m1, m2, [srcq+r6*2+16], 0 1731*c0909341SAndroid Build Coastguard Worker mov r8, dstq 1732*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r7 +ssq*0+ 0], 1 1733*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [r7 +ssq*0+16], 1 1734*c0909341SAndroid Build Coastguard Worker shufpd m4, m3, m1, 0x05 1735*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] 1736*c0909341SAndroid Build Coastguard Worker pshufb m%1, m12 ; 01 12 23 34 1737*c0909341SAndroid Build Coastguard Worker pshufb m%2, m12 ; 45 56 67 78 1738*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m7, m%1 ; a0 1739*c0909341SAndroid Build Coastguard Worker pshufb m%3, m12 ; 89 9a ab bc 1740*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%2 ; a2 1741*c0909341SAndroid Build Coastguard Worker shufpd m%1, 
m%2, 0x05 ; 23 34 45 56 1742*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 ; a0+a2 1743*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m7, m%2 ; b0 1744*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 67 78 89 9a 1745*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m9 ; b2 1746*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 ; a1 1747*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m8 ; b1 1748*c0909341SAndroid Build Coastguard Worker paddd m%3, m%5 ; b0+b2 1749*c0909341SAndroid Build Coastguard Worker paddd m%4, m10 1750*c0909341SAndroid Build Coastguard Worker paddd m%3, m10 1751*c0909341SAndroid Build Coastguard Worker paddd m%1, m%4 1752*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 1753*c0909341SAndroid Build Coastguard Worker psrad m%1, 10 1754*c0909341SAndroid Build Coastguard Worker psrad m%2, 10 1755*c0909341SAndroid Build Coastguard Worker packssdw m%1, m%2 1756*c0909341SAndroid Build Coastguard Worker%endmacro 1757*c0909341SAndroid Build Coastguard Worker PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 1758*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 *1+ 0] 1759*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+ssq*1+ 0], 1 1760*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 1761*c0909341SAndroid Build Coastguard Worker PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 1762*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+r6 *1+16] 1763*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+16], 1 1764*c0909341SAndroid Build Coastguard Worker shufpd m1, m4, m2, 0x05 1765*c0909341SAndroid Build Coastguard Worker PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 1766*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, q3120 1767*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q3120 1768*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 1769*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 ; 01 1770*c0909341SAndroid Build Coastguard 
Worker punpckhwd m3, m4 ; 23 1771*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 ; 12 1772*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 34 1773*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 1774*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [v_mul+4*0] 1775*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [v_mul+4*1] 1776*c0909341SAndroid Build Coastguard Worker movu xm5, [r7+ssq*1+ 0] 1777*c0909341SAndroid Build Coastguard Worker movu xm6, [r7+ssq*1+16] 1778*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 1779*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m15, m1 ; a0 1780*c0909341SAndroid Build Coastguard Worker pmaddwd m15, m2 ; b0 1781*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r7+ssq*0+ 0], 1 1782*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r7+ssq*0+16], 1 1783*c0909341SAndroid Build Coastguard Worker mova m1, m3 1784*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 ; a1 1785*c0909341SAndroid Build Coastguard Worker mova m2, m4 1786*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 ; b1 1787*c0909341SAndroid Build Coastguard Worker paddd m14, m3 1788*c0909341SAndroid Build Coastguard Worker shufpd m3, m5, m6, 0x05 1789*c0909341SAndroid Build Coastguard Worker paddd m15, m4 1790*c0909341SAndroid Build Coastguard Worker PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6 1791*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [v_mul+4*2] 1792*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 1793*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, m5, 0x05 1794*c0909341SAndroid Build Coastguard Worker mova m0, m5 1795*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 ; 45 1796*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 56 1797*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m6, m3 ; a2 1798*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m4 ; b2 1799*c0909341SAndroid Build Coastguard Worker paddd 
m14, m10 1800*c0909341SAndroid Build Coastguard Worker paddd m15, m10 1801*c0909341SAndroid Build Coastguard Worker paddd m5, m14 1802*c0909341SAndroid Build Coastguard Worker paddd m6, m15 1803*c0909341SAndroid Build Coastguard Worker psrad m5, 10 1804*c0909341SAndroid Build Coastguard Worker psrad m6, 10 1805*c0909341SAndroid Build Coastguard Worker packusdw m5, m6 1806*c0909341SAndroid Build Coastguard Worker pminsw m5, m11 1807*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 1808*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], xm5 1809*c0909341SAndroid Build Coastguard Worker vextracti128 [r8+dsq*1], m5, 1 1810*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 1811*c0909341SAndroid Build Coastguard Worker sub hd, 2 1812*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 1813*c0909341SAndroid Build Coastguard Worker add srcq, 16 1814*c0909341SAndroid Build Coastguard Worker add dstq, 16 1815*c0909341SAndroid Build Coastguard Worker movzx hd, wb 1816*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 1817*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop0 1818*c0909341SAndroid Build Coastguard Worker RET 1819*c0909341SAndroid Build Coastguard Worker 1820*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 1821*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc 1822*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc 1823*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc 1824*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp, SHARP, SHARP 1825*c0909341SAndroid Build Coastguard Worker 1826*c0909341SAndroid Build Coastguard Workercglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my 1827*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx2 1828*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 
1829*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 1830*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 1831*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 1832*c0909341SAndroid Build Coastguard Worker lea r8, [put_avx2] 1833*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1834*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1835*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 1836*c0909341SAndroid Build Coastguard Worker jnz .h 1837*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1838*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put 1839*c0909341SAndroid Build Coastguard Worker.v: 1840*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1841*c0909341SAndroid Build Coastguard Worker shr myd, 16 1842*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1843*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1844*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+myq*8] 1845*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 15 1846*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [pd_32] 1847*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, r8m 1848*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 1849*c0909341SAndroid Build Coastguard Worker sub srcq, r6 1850*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 1851*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 1852*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q0000 1853*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q1111 1854*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q2222 1855*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q3333 1856*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1857*c0909341SAndroid Build Coastguard Worker jg .v_w8 1858*c0909341SAndroid Build Coastguard Worker je .v_w4 1859*c0909341SAndroid 
Build Coastguard Worker.v_w2: 1860*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+ssq*0] 1861*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*1], 1 1862*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*2], 2 1863*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 1864*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 1865*c0909341SAndroid Build Coastguard Worker movd xm3, [srcq+ssq*0] 1866*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm1, [srcq+ssq*1] 1867*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*2] 1868*c0909341SAndroid Build Coastguard Worker add srcq, r6 1869*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm1, 0x02 ; 4 5 1870*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm0, 0x02 ; 5 6 1871*c0909341SAndroid Build Coastguard Worker palignr xm4, xm3, xm2, 4 ; 1 2 3 4 1872*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm1 ; 45 56 1873*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2, xm4 ; 01 12 1874*c0909341SAndroid Build Coastguard Worker punpckhwd xm2, xm4 ; 23 34 1875*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 1876*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [srcq+ssq*0] 1877*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm8, xm1 ; a0 b0 1878*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1879*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm9 ; a1 b1 1880*c0909341SAndroid Build Coastguard Worker paddd xm5, xm6 1881*c0909341SAndroid Build Coastguard Worker paddd xm5, xm2 1882*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 1883*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm10 ; a2 b2 1884*c0909341SAndroid Build Coastguard Worker paddd xm5, xm3 1885*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm0, xm4, 0x02 ; 6 7 1886*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*1] 1887*c0909341SAndroid Build Coastguard Worker 
lea srcq, [srcq+ssq*2] 1888*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x02 ; 7 8 1889*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm4 ; 67 78 1890*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm11, xm3 ; a3 b3 1891*c0909341SAndroid Build Coastguard Worker paddd xm5, xm4 1892*c0909341SAndroid Build Coastguard Worker psrad xm5, 6 1893*c0909341SAndroid Build Coastguard Worker packusdw xm5, xm5 1894*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm7 1895*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm5 1896*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm5, 1 1897*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1898*c0909341SAndroid Build Coastguard Worker sub hd, 2 1899*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 1900*c0909341SAndroid Build Coastguard Worker RET 1901*c0909341SAndroid Build Coastguard Worker.v_w4: 1902*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*0] 1903*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*1] 1904*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*2] 1905*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+r6 ] 1906*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 1907*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*0] 1908*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+ssq*1] 1909*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x30 1910*c0909341SAndroid Build Coastguard Worker vpblendd m0, m2, 0x30 1911*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0 ; 01 12 1912*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*2] 1913*c0909341SAndroid Build Coastguard Worker add srcq, r6 1914*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0x30 1915*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, 0x30 1916*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 ; 23 34 
1917*c0909341SAndroid Build Coastguard Worker vpblendd m3, m5, 0x30 1918*c0909341SAndroid Build Coastguard Worker vpblendd m5, m0, 0x30 1919*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m5 ; 45 56 1920*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1921*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*0] 1922*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m8, m1 ; a0 b0 1923*c0909341SAndroid Build Coastguard Worker mova m1, m2 1924*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m9 ; a1 b1 1925*c0909341SAndroid Build Coastguard Worker paddd m5, m6 1926*c0909341SAndroid Build Coastguard Worker paddd m5, m2 1927*c0909341SAndroid Build Coastguard Worker mova m2, m3 1928*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10 ; a2 b2 1929*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1930*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, m4, 0x30 1931*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*1] 1932*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1933*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 1934*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 ; 67 78 1935*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m11, m3 ; a3 b3 1936*c0909341SAndroid Build Coastguard Worker paddd m5, m4 1937*c0909341SAndroid Build Coastguard Worker psrad m5, 6 1938*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m5, 1 1939*c0909341SAndroid Build Coastguard Worker packusdw xm5, xm4 1940*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm7 1941*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm5 1942*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm5 1943*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1944*c0909341SAndroid Build Coastguard Worker sub hd, 2 1945*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1946*c0909341SAndroid Build Coastguard Worker RET 1947*c0909341SAndroid Build 
Coastguard Worker.v_w8: 1948*c0909341SAndroid Build Coastguard Worker shl wd, 5 1949*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 15 1950*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 1951*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: 1952*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+ssq*0] 1953*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [srcq+ssq*1] 1954*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*4] 1955*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+r6 ] 1956*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [srcq+ssq*2] 1957*c0909341SAndroid Build Coastguard Worker mov r8, dstq 1958*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [r7+ssq*0] 1959*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r7+ssq*1] 1960*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r7+ssq*2] 1961*c0909341SAndroid Build Coastguard Worker add r7, r6 1962*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, 0x0c 1963*c0909341SAndroid Build Coastguard Worker shufpd m5, m1, 0x0c 1964*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 ; 01 1965*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 1966*c0909341SAndroid Build Coastguard Worker shufpd m6, m2, 0x0c 1967*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5, m6 ; 12 1968*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 1969*c0909341SAndroid Build Coastguard Worker shufpd m0, m3, 0x0c 1970*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m6, m0 ; 23 1971*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 ; 56 1972*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1973*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m14, [r7+ssq*0] 1974*c0909341SAndroid Build Coastguard Worker pmaddwd m12, m8, m1 ; a0 1975*c0909341SAndroid Build Coastguard Worker pmaddwd m13, m8, m2 ; b0 1976*c0909341SAndroid Build Coastguard Worker mova 
m1, m3 1977*c0909341SAndroid Build Coastguard Worker mova m2, m4 1978*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9 ; a1 1979*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 ; b1 1980*c0909341SAndroid Build Coastguard Worker paddd m12, m3 1981*c0909341SAndroid Build Coastguard Worker paddd m13, m4 1982*c0909341SAndroid Build Coastguard Worker mova m3, m5 1983*c0909341SAndroid Build Coastguard Worker mova m4, m6 1984*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m10 ; a2 1985*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m10 ; b2 1986*c0909341SAndroid Build Coastguard Worker paddd m12, m5 1987*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [r7+ssq*1] 1988*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 1989*c0909341SAndroid Build Coastguard Worker paddd m13, m6 1990*c0909341SAndroid Build Coastguard Worker shufpd m6, m0, m14, 0x0d 1991*c0909341SAndroid Build Coastguard Worker shufpd m0, m14, m5, 0x0c 1992*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m0 ; 67 1993*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 ; 78 1994*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m11, m5 ; a3 1995*c0909341SAndroid Build Coastguard Worker paddd m12, m14 1996*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m11, m6 ; b3 1997*c0909341SAndroid Build Coastguard Worker paddd m13, m14 1998*c0909341SAndroid Build Coastguard Worker psrad m12, 5 1999*c0909341SAndroid Build Coastguard Worker psrad m13, 5 2000*c0909341SAndroid Build Coastguard Worker packusdw m12, m13 2001*c0909341SAndroid Build Coastguard Worker pxor m13, m13 2002*c0909341SAndroid Build Coastguard Worker pavgw m12, m13 2003*c0909341SAndroid Build Coastguard Worker pminsw m12, m7 2004*c0909341SAndroid Build Coastguard Worker vpermq m12, m12, q3120 2005*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], xm12 2006*c0909341SAndroid Build Coastguard Worker vextracti128 [r8+dsq*1], m12, 1 2007*c0909341SAndroid Build Coastguard 
Worker lea r8, [r8+dsq*2] 2008*c0909341SAndroid Build Coastguard Worker sub hd, 2 2009*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 2010*c0909341SAndroid Build Coastguard Worker add srcq, 16 2011*c0909341SAndroid Build Coastguard Worker add dstq, 16 2012*c0909341SAndroid Build Coastguard Worker movzx hd, wb 2013*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 2014*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop0 2015*c0909341SAndroid Build Coastguard Worker RET 2016*c0909341SAndroid Build Coastguard Worker.h: 2017*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 2018*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2019*c0909341SAndroid Build Coastguard Worker jnz .hv 2020*c0909341SAndroid Build Coastguard Worker mov r7d, r8m 2021*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, r8m 2022*c0909341SAndroid Build Coastguard Worker shr r7d, 11 2023*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] 2024*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2025*c0909341SAndroid Build Coastguard Worker jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2 2026*c0909341SAndroid Build Coastguard Worker je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4 2027*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 13 2028*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2029*c0909341SAndroid Build Coastguard Worker sub srcq, 6 2030*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+mxq*8] 2031*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufA] 2032*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufB] 2033*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2034*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2035*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q0000 2036*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q1111 2037*c0909341SAndroid Build Coastguard 
Worker pshufd m10, m0, q2222 2038*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q3333 2039*c0909341SAndroid Build Coastguard Worker sub wd, 16 2040*c0909341SAndroid Build Coastguard Worker jge .h_w16 2041*c0909341SAndroid Build Coastguard Worker.h_w8: 2042*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] 2043*c0909341SAndroid Build Coastguard Worker pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 2044*c0909341SAndroid Build Coastguard Worker pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 2045*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%4 ; abcd1 2046*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 ; abcd0 2047*c0909341SAndroid Build Coastguard Worker pshufb m%2, m7 ; 6 7 7 8 8 9 9 a 2048*c0909341SAndroid Build Coastguard Worker shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 2049*c0909341SAndroid Build Coastguard Worker paddd m%5, m4 2050*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 2051*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m11, m%2 ; abcd3 2052*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 2053*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m10, m%4 ; abcd2 2054*c0909341SAndroid Build Coastguard Worker pshufb m%3, m7 ; a b b c c d d e 2055*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m8 ; efgh0 2056*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 2057*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%2 ; efgh1 2058*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c 2059*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m11 ; efgh3 2060*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m10 ; efgh2 2061*c0909341SAndroid Build Coastguard Worker paddd m%4, m4 2062*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 2063*c0909341SAndroid Build Coastguard Worker paddd m%3, m%4 2064*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 2065*c0909341SAndroid Build Coastguard Worker psrad m%1, 6 
2066*c0909341SAndroid Build Coastguard Worker psrad m%2, 6 2067*c0909341SAndroid Build Coastguard Worker packusdw m%1, m%2 2068*c0909341SAndroid Build Coastguard Worker pminsw m%1, m5 2069*c0909341SAndroid Build Coastguard Worker%endmacro 2070*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+ 0] 2071*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+ 0], 1 2072*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*0+16] 2073*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+16], 1 2074*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2075*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 2076*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 1, 2, 3, 12 2077*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 2078*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 2079*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2080*c0909341SAndroid Build Coastguard Worker sub hd, 2 2081*c0909341SAndroid Build Coastguard Worker jg .h_w8 2082*c0909341SAndroid Build Coastguard Worker RET 2083*c0909341SAndroid Build Coastguard Worker.h_w16: 2084*c0909341SAndroid Build Coastguard Worker mov r6d, wd 2085*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 2086*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6*2+ 0] 2087*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6*2+ 8] 2088*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+16] 2089*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 1, 2, 3, 12 2090*c0909341SAndroid Build Coastguard Worker mova [dstq+r6*2], m0 2091*c0909341SAndroid Build Coastguard Worker sub r6d, 16 2092*c0909341SAndroid Build Coastguard Worker jge .h_w16_loop 2093*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2094*c0909341SAndroid Build Coastguard Worker add dstq, dsq 2095*c0909341SAndroid Build Coastguard Worker dec hd 2096*c0909341SAndroid Build Coastguard Worker jg 
.h_w16 2097*c0909341SAndroid Build Coastguard Worker RET 2098*c0909341SAndroid Build Coastguard Worker.hv: 2099*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 16 2100*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, r8m 2101*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2102*c0909341SAndroid Build Coastguard Worker jg .hv_w8 2103*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2104*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+subpel_filters+mxq*8+2] 2105*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2106*c0909341SAndroid Build Coastguard Worker shr myd, 16 2107*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2108*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2109*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [base+subpel_filters+myq*8] 2110*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [pd_512] 2111*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2112*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2113*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2114*c0909341SAndroid Build Coastguard Worker pxor m7, m7 2115*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m0 2116*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 2117*c0909341SAndroid Build Coastguard Worker psraw m1, 8 ; sign-extend 2118*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 2119*c0909341SAndroid Build Coastguard Worker jz .hv_10bit 2120*c0909341SAndroid Build Coastguard Worker psraw m7, 2 2121*c0909341SAndroid Build Coastguard Worker psllw m1, 2 2122*c0909341SAndroid Build Coastguard Worker.hv_10bit: 2123*c0909341SAndroid Build Coastguard Worker pshufd m11, m1, q0000 2124*c0909341SAndroid Build Coastguard Worker pshufd m12, m1, q1111 2125*c0909341SAndroid Build Coastguard Worker pshufd m13, m1, q2222 2126*c0909341SAndroid Build Coastguard Worker pshufd m14, m1, q3333 2127*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2128*c0909341SAndroid Build 
Coastguard Worker je .hv_w4 2129*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shuf2] 2130*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [srcq+r6 ] ; 3 3 2131*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*2] 2132*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 2133*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*1] 2134*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2135*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 2136*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 2137*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 2138*c0909341SAndroid Build Coastguard Worker add srcq, r6 2139*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 2140*c0909341SAndroid Build Coastguard Worker pshufb m3, m9 2141*c0909341SAndroid Build Coastguard Worker pshufb m0, m9 2142*c0909341SAndroid Build Coastguard Worker pshufb m2, m9 2143*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 2144*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 2145*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m7 2146*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 2147*c0909341SAndroid Build Coastguard Worker phaddd m1, m3 2148*c0909341SAndroid Build Coastguard Worker phaddd m0, m2 2149*c0909341SAndroid Build Coastguard Worker paddd m1, m6 2150*c0909341SAndroid Build Coastguard Worker paddd m0, m6 2151*c0909341SAndroid Build Coastguard Worker psrad m1, 10 2152*c0909341SAndroid Build Coastguard Worker psrad m0, 10 2153*c0909341SAndroid Build Coastguard Worker packssdw m1, m0 ; 3 2 0 1 2154*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m1, 1 ; 3 4 5 6 2155*c0909341SAndroid Build Coastguard Worker pshufd xm2, xm1, q1301 ; 2 3 1 2 2156*c0909341SAndroid Build Coastguard Worker pshufd xm3, xm0, q2121 ; 4 5 4 5 2157*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm2 ; 01 
12 2158*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm0 ; 23 34 2159*c0909341SAndroid Build Coastguard Worker punpckhwd xm3, xm0 ; 45 56 2160*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 2161*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*0] 2162*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*1] 2163*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2164*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm9 2165*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm9 2166*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm7 2167*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm7 2168*c0909341SAndroid Build Coastguard Worker phaddd xm4, xm5 2169*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm11, xm1 ; a0 b0 2170*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 2171*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm12 ; a1 b1 2172*c0909341SAndroid Build Coastguard Worker paddd xm5, xm2 2173*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 2174*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm13 ; a2 b2 2175*c0909341SAndroid Build Coastguard Worker paddd xm5, xm3 2176*c0909341SAndroid Build Coastguard Worker paddd xm4, xm6 2177*c0909341SAndroid Build Coastguard Worker psrad xm4, 10 2178*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm4 2179*c0909341SAndroid Build Coastguard Worker palignr xm3, xm4, xm0, 12 2180*c0909341SAndroid Build Coastguard Worker mova xm0, xm4 2181*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0 ; 67 78 2182*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm14, xm3 ; a3 b3 2183*c0909341SAndroid Build Coastguard Worker paddd xm5, xm6 2184*c0909341SAndroid Build Coastguard Worker paddd xm5, xm4 2185*c0909341SAndroid Build Coastguard Worker psrad xm5, 10 2186*c0909341SAndroid Build Coastguard Worker packusdw xm5, xm5 2187*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm15 2188*c0909341SAndroid Build 
Coastguard Worker movd [dstq+dsq*0], xm5 2189*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm5, 1 2190*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2191*c0909341SAndroid Build Coastguard Worker sub hd, 2 2192*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 2193*c0909341SAndroid Build Coastguard Worker RET 2194*c0909341SAndroid Build Coastguard Worker.hv_w4: 2195*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufA] 2196*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [subpel_h_shufB] 2197*c0909341SAndroid Build Coastguard Worker pshufd m8, m7, q1111 2198*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 2199*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0] 2200*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 2201*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+r6 ] 2202*c0909341SAndroid Build Coastguard Worker vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 2203*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2204*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 2205*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 2206*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 2207*c0909341SAndroid Build Coastguard Worker add srcq, r6 2208*c0909341SAndroid Build Coastguard Worker pshufb m4, m1, m9 2209*c0909341SAndroid Build Coastguard Worker pshufb m1, m10 2210*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 2211*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8 2212*c0909341SAndroid Build Coastguard Worker pshufb m5, m2, m9 2213*c0909341SAndroid Build Coastguard Worker pshufb m2, m10 2214*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m7 2215*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 2216*c0909341SAndroid Build Coastguard Worker paddd m4, m6 2217*c0909341SAndroid Build Coastguard Worker paddd m1, m4 
2218*c0909341SAndroid Build Coastguard Worker pshufb m4, m0, m9 2219*c0909341SAndroid Build Coastguard Worker pshufb m0, m10 2220*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 2221*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m8 2222*c0909341SAndroid Build Coastguard Worker paddd m5, m6 2223*c0909341SAndroid Build Coastguard Worker paddd m2, m5 2224*c0909341SAndroid Build Coastguard Worker pshufb m5, m3, m9 2225*c0909341SAndroid Build Coastguard Worker pshufb m3, m10 2226*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m7 2227*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m8 2228*c0909341SAndroid Build Coastguard Worker paddd m4, m6 2229*c0909341SAndroid Build Coastguard Worker paddd m4, m0 2230*c0909341SAndroid Build Coastguard Worker paddd m5, m6 2231*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2232*c0909341SAndroid Build Coastguard Worker vperm2i128 m0, m1, m2, 0x21 2233*c0909341SAndroid Build Coastguard Worker psrld m1, 10 2234*c0909341SAndroid Build Coastguard Worker psrld m2, 10 2235*c0909341SAndroid Build Coastguard Worker vperm2i128 m3, m4, m5, 0x21 2236*c0909341SAndroid Build Coastguard Worker pslld m4, 6 2237*c0909341SAndroid Build Coastguard Worker pslld m5, 6 2238*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa ; 23 34 2239*c0909341SAndroid Build Coastguard Worker pslld m0, 6 2240*c0909341SAndroid Build Coastguard Worker pblendw m1, m0, 0xaa ; 01 12 2241*c0909341SAndroid Build Coastguard Worker psrld m3, 10 2242*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa ; 45 56 2243*c0909341SAndroid Build Coastguard Worker psrad m0, m5, 16 2244*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 2245*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*0] 2246*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+ssq*1], 1 2247*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2248*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m11, m1 ; a0 b0 
2249*c0909341SAndroid Build Coastguard Worker mova m1, m2 2250*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m12 ; a1 b1 2251*c0909341SAndroid Build Coastguard Worker paddd m5, m6 2252*c0909341SAndroid Build Coastguard Worker paddd m5, m2 2253*c0909341SAndroid Build Coastguard Worker mova m2, m3 2254*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 ; a2 b2 2255*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2256*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m9 2257*c0909341SAndroid Build Coastguard Worker pshufb m4, m10 2258*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 2259*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m8 2260*c0909341SAndroid Build Coastguard Worker paddd m3, m6 2261*c0909341SAndroid Build Coastguard Worker paddd m4, m3 2262*c0909341SAndroid Build Coastguard Worker psrad m4, 10 2263*c0909341SAndroid Build Coastguard Worker packssdw m0, m4 ; _ 7 6 8 2264*c0909341SAndroid Build Coastguard Worker vpermq m3, m0, q1122 ; _ 6 _ 7 2265*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 ; 67 78 2266*c0909341SAndroid Build Coastguard Worker mova m0, m4 2267*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m14, m3 ; a3 b3 2268*c0909341SAndroid Build Coastguard Worker paddd m4, m5 2269*c0909341SAndroid Build Coastguard Worker psrad m4, 10 2270*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 2271*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm5 2272*c0909341SAndroid Build Coastguard Worker pminsw xm4, xm15 2273*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm4 2274*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm4 2275*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2276*c0909341SAndroid Build Coastguard Worker sub hd, 2 2277*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 2278*c0909341SAndroid Build Coastguard Worker RET 2279*c0909341SAndroid Build Coastguard Worker.hv_w8: 2280*c0909341SAndroid Build Coastguard Worker 
shr mxd, 16 2281*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [base+subpel_filters+mxq*8] 2282*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2283*c0909341SAndroid Build Coastguard Worker shr myd, 16 2284*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2285*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2286*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [base+subpel_filters+myq*8] 2287*c0909341SAndroid Build Coastguard Worker shl wd, 5 2288*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2289*c0909341SAndroid Build Coastguard Worker sub srcq, 6 2290*c0909341SAndroid Build Coastguard Worker pxor m0, m0 2291*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2292*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 2293*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 2294*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 2295*c0909341SAndroid Build Coastguard Worker jz .hv_w8_10bit 2296*c0909341SAndroid Build Coastguard Worker psraw m0, 2 2297*c0909341SAndroid Build Coastguard Worker psllw xm1, 2 2298*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit: 2299*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q0000 2300*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q1111 2301*c0909341SAndroid Build Coastguard Worker mova [v_mul], xm1 2302*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q2222 2303*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q3333 2304*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 2305*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 2306*c0909341SAndroid Build Coastguard Worker pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 2307*c0909341SAndroid Build Coastguard Worker pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 2308*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m2 2309*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m11 2310*c0909341SAndroid Build Coastguard Worker pshufb m%2, m9 ; 
6 7 7 8 8 9 9 a 2311*c0909341SAndroid Build Coastguard Worker shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 2312*c0909341SAndroid Build Coastguard Worker paddd m3, m10 2313*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 2314*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14, m%2 2315*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 2316*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13, m2 2317*c0909341SAndroid Build Coastguard Worker pshufb m%3, m9 ; a b b c c d d e 2318*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m11 2319*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 2320*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m%2 2321*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c 2322*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m14 2323*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m13 2324*c0909341SAndroid Build Coastguard Worker paddd m2, m10 2325*c0909341SAndroid Build Coastguard Worker paddd m2, m3 2326*c0909341SAndroid Build Coastguard Worker paddd m%3, m2 2327*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 2328*c0909341SAndroid Build Coastguard Worker psrad m%1, 10 2329*c0909341SAndroid Build Coastguard Worker psrad m%2, 10 2330*c0909341SAndroid Build Coastguard Worker packssdw m%1, m%2 2331*c0909341SAndroid Build Coastguard Worker%endmacro 2332*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 *1+ 0] 2333*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufA] 2334*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*4] 2335*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+r6 *1+ 8] 2336*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufB] 2337*c0909341SAndroid Build Coastguard Worker mov r8, dstq 2338*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+r6 *1+16] 2339*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_512] 2340*c0909341SAndroid Build Coastguard Worker 
movu xm5, [srcq+ssq*0+ 0] 2341*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r7 +ssq*0+ 0], 1 2342*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0+16] 2343*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [r7 +ssq*0+16], 1 2344*c0909341SAndroid Build Coastguard Worker shufpd m7, m5, m1, 0x05 2345*c0909341SAndroid Build Coastguard Worker INIT_XMM avx2 2346*c0909341SAndroid Build Coastguard Worker PUT_8TAP_HV_H 4, 6, 0 ; 3 2347*c0909341SAndroid Build Coastguard Worker INIT_YMM avx2 2348*c0909341SAndroid Build Coastguard Worker PUT_8TAP_HV_H 5, 7, 1 ; 0 4 2349*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*2+ 0] 2350*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+r6 *2+ 0], 1 2351*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*2+16] 2352*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+r6 *2+16], 1 2353*c0909341SAndroid Build Coastguard Worker shufpd m7, m0, m1, 0x05 2354*c0909341SAndroid Build Coastguard Worker PUT_8TAP_HV_H 0, 7, 1 ; 2 6 2355*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ssq*1+ 0] 2356*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1+16] 2357*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r7 +ssq*1+ 0], 1 2358*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [r7 +ssq*1+16], 1 2359*c0909341SAndroid Build Coastguard Worker add r7, r6 2360*c0909341SAndroid Build Coastguard Worker shufpd m7, m6, m1, 0x05 2361*c0909341SAndroid Build Coastguard Worker PUT_8TAP_HV_H 6, 7, 1 ; 1 5 2362*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q1100 2363*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 2364*c0909341SAndroid Build Coastguard Worker vpermq m6, m6, q3120 2365*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 2366*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m7, m4 ; 23 2367*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 2368*c0909341SAndroid Build Coastguard 
Worker punpcklwd m1, m5, m6 ; 01 2369*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 2370*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m6, m7 ; 12 2371*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 56 2372*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 2373*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [v_mul+4*0] 2374*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [v_mul+4*1] 2375*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [v_mul+4*2] 2376*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m9, m1 ; a0 2377*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m2 ; b0 2378*c0909341SAndroid Build Coastguard Worker mova m1, m3 2379*c0909341SAndroid Build Coastguard Worker mova m2, m4 2380*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 ; a1 2381*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 ; b1 2382*c0909341SAndroid Build Coastguard Worker paddd m8, m3 2383*c0909341SAndroid Build Coastguard Worker paddd m9, m4 2384*c0909341SAndroid Build Coastguard Worker mova m3, m5 2385*c0909341SAndroid Build Coastguard Worker mova m4, m6 2386*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m10 ; a2 2387*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m10 ; b2 2388*c0909341SAndroid Build Coastguard Worker paddd m8, m5 2389*c0909341SAndroid Build Coastguard Worker paddd m9, m6 2390*c0909341SAndroid Build Coastguard Worker movu xm5, [r7+ssq*0] 2391*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r7+ssq*1], 1 2392*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufA] 2393*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [subpel_h_shufB] 2394*c0909341SAndroid Build Coastguard Worker movu xm6, [r7+ssq*0+16] 2395*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r7+ssq*1+16], 1 2396*c0909341SAndroid Build Coastguard Worker vextracti128 [r8], m0, 1 2397*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m7 ; 01 
2398*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 ; 23 2399*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 2400*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m12 2401*c0909341SAndroid Build Coastguard Worker paddd m0, m5 2402*c0909341SAndroid Build Coastguard Worker pshufb m5, m6, m7 ; 89 2403*c0909341SAndroid Build Coastguard Worker pshufb m6, m10 ; ab 2404*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m13 2405*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m14 2406*c0909341SAndroid Build Coastguard Worker paddd m6, m5 2407*c0909341SAndroid Build Coastguard Worker movu xm5, [r7+ssq*0+8] 2408*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r7+ssq*1+8], 1 2409*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 2410*c0909341SAndroid Build Coastguard Worker pshufb m7, m5, m7 2411*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 2412*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m13, m7 2413*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m11 2414*c0909341SAndroid Build Coastguard Worker paddd m0, m10 2415*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_512] 2416*c0909341SAndroid Build Coastguard Worker paddd m6, m7 2417*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m14, m5 2418*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m12 2419*c0909341SAndroid Build Coastguard Worker paddd m0, m7 2420*c0909341SAndroid Build Coastguard Worker paddd m5, m6 2421*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [r8] 2422*c0909341SAndroid Build Coastguard Worker paddd m8, m10 2423*c0909341SAndroid Build Coastguard Worker paddd m9, m10 2424*c0909341SAndroid Build Coastguard Worker paddd m0, m10 2425*c0909341SAndroid Build Coastguard Worker paddd m5, m10 2426*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [v_mul+4*3] 2427*c0909341SAndroid Build Coastguard Worker psrad m0, 10 2428*c0909341SAndroid Build Coastguard Worker psrad m5, 10 2429*c0909341SAndroid 
Build Coastguard Worker packssdw m0, m5 2430*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 ; 7 8 2431*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x04 ; 6 7 2432*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m7 ; 67 2433*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 78 2434*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m10, m5 ; a3 2435*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m6 ; b3 2436*c0909341SAndroid Build Coastguard Worker paddd m7, m8 2437*c0909341SAndroid Build Coastguard Worker paddd m9, m10 2438*c0909341SAndroid Build Coastguard Worker psrad m7, 10 2439*c0909341SAndroid Build Coastguard Worker psrad m9, 10 2440*c0909341SAndroid Build Coastguard Worker packusdw m7, m9 2441*c0909341SAndroid Build Coastguard Worker pminsw m7, m15 2442*c0909341SAndroid Build Coastguard Worker vpermq m7, m7, q3120 2443*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], xm7 2444*c0909341SAndroid Build Coastguard Worker vextracti128 [r8+dsq*1], m7, 1 2445*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 2446*c0909341SAndroid Build Coastguard Worker sub hd, 2 2447*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 2448*c0909341SAndroid Build Coastguard Worker add srcq, 16 2449*c0909341SAndroid Build Coastguard Worker add dstq, 16 2450*c0909341SAndroid Build Coastguard Worker movzx hd, wb 2451*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 2452*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop0 2453*c0909341SAndroid Build Coastguard Worker RET 2454*c0909341SAndroid Build Coastguard Worker 2455*c0909341SAndroid Build Coastguard Worker%if WIN64 2456*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4 2457*c0909341SAndroid Build Coastguard Worker%else 2458*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7 2459*c0909341SAndroid Build Coastguard Worker%endif 2460*c0909341SAndroid Build Coastguard Worker 2461*c0909341SAndroid Build Coastguard Worker%define 
PREP_8TAP_FN FN prep_8tap, 2462*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc 2463*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc 2464*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc 2465*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular, REGULAR, REGULAR 2466*c0909341SAndroid Build Coastguard Worker 2467*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my 2468*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx2 2469*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 2470*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 6tap_h, mx, 4tap_h 2471*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 2472*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 6tap_v, my, 4tap_v 2473*c0909341SAndroid Build Coastguard Worker lea r7, [prep_avx2] 2474*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 2475*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 2476*c0909341SAndroid Build Coastguard Worker jnz .h 2477*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2478*c0909341SAndroid Build Coastguard Worker jnz .v 2479*c0909341SAndroid Build Coastguard Worker.prep: 2480*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 2481*c0909341SAndroid Build Coastguard Worker mov r6d, r7m ; bitdepth_max 2482*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(prep,)] 2483*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [r7-prep_avx2+pw_8192] 2484*c0909341SAndroid Build Coastguard Worker shr r6d, 11 2485*c0909341SAndroid Build Coastguard Worker add wq, r7 2486*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+prep_mul+r6*4] 2487*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2488*c0909341SAndroid Build Coastguard Worker%if WIN64 
2489*c0909341SAndroid Build Coastguard Worker pop r7 2490*c0909341SAndroid Build Coastguard Worker%endif 2491*c0909341SAndroid Build Coastguard Worker jmp wq 2492*c0909341SAndroid Build Coastguard Worker.h_w4: 2493*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2494*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2495*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, [base+subpel_filters+mxq*8] 2496*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [subpel_h_shufA] 2497*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2498*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [subpel_h_shufB] 2499*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 2500*c0909341SAndroid Build Coastguard Worker pshufd xm0, xm0, q2211 2501*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 2502*c0909341SAndroid Build Coastguard Worker jnz .h_w4_12bpc 2503*c0909341SAndroid Build Coastguard Worker psllw xm0, 2 2504*c0909341SAndroid Build Coastguard Worker.h_w4_12bpc: 2505*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, xm0 2506*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q1111 2507*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 2508*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0] 2509*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*2], 1 2510*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*1] 2511*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+r6 *1], 1 2512*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2513*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 2514*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 ; 2 3 3 4 4 5 5 6 2515*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 2516*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 2517*c0909341SAndroid Build Coastguard Worker paddd m0, m5 2518*c0909341SAndroid Build Coastguard Worker paddd m0, m1 2519*c0909341SAndroid 
Build Coastguard Worker pshufb m1, m2, m3 2520*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 2521*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m6 2522*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 2523*c0909341SAndroid Build Coastguard Worker paddd m1, m5 2524*c0909341SAndroid Build Coastguard Worker paddd m1, m2 2525*c0909341SAndroid Build Coastguard Worker psrad m0, 4 2526*c0909341SAndroid Build Coastguard Worker psrad m1, 4 2527*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 2528*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 2529*c0909341SAndroid Build Coastguard Worker add tmpq, 32 2530*c0909341SAndroid Build Coastguard Worker sub hd, 4 2531*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 2532*c0909341SAndroid Build Coastguard Worker RET 2533*c0909341SAndroid Build Coastguard Worker.h: 2534*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2535*c0909341SAndroid Build Coastguard Worker jnz .hv 2536*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) 2537*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2538*c0909341SAndroid Build Coastguard Worker je .h_w4 2539*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2540*c0909341SAndroid Build Coastguard Worker sub srcq, 4 2541*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+1+mxq*8] 2542*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 10 2543*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufA] 2544*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2545*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2546*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 2547*c0909341SAndroid Build Coastguard Worker jnz .h_12bpc 2548*c0909341SAndroid Build Coastguard Worker psllw m0, 2 2549*c0909341SAndroid Build Coastguard Worker.h_12bpc: 2550*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q0000 
2551*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q1111 2552*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q2222 2553*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2554*c0909341SAndroid Build Coastguard Worker jg .h_w16 2555*c0909341SAndroid Build Coastguard Worker.h_w8: 2556*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+ 0] 2557*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+ 0], 1 2558*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*0+16] 2559*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+16], 1 2560*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2561*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 2562*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] 2563*c0909341SAndroid Build Coastguard Worker pshufb m%1, m6 ; 01 12 23 34 2564*c0909341SAndroid Build Coastguard Worker pshufb m%2, m6 ; 45 56 67 78 2565*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m7, m%1 ; a0 2566*c0909341SAndroid Build Coastguard Worker pshufb m%3, m6 ; 89 9a ab bc 2567*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%2 ; a2 2568*c0909341SAndroid Build Coastguard Worker shufpd m%1, m%2, 0x05 ; 23 34 45 56 2569*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 ; a0+a2 2570*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m7, m%2 ; b0 2571*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 67 78 89 9a 2572*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m9 ; b2 2573*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 ; a1 2574*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m8 ; b1 2575*c0909341SAndroid Build Coastguard Worker paddd m%3, m%5 ; b0+b2 2576*c0909341SAndroid Build Coastguard Worker paddd m%4, m5 2577*c0909341SAndroid Build Coastguard Worker paddd m%3, m5 2578*c0909341SAndroid Build Coastguard Worker paddd m%1, m%4 2579*c0909341SAndroid Build 
Coastguard Worker paddd m%2, m%3 2580*c0909341SAndroid Build Coastguard Worker psrad m%1, 4 2581*c0909341SAndroid Build Coastguard Worker psrad m%2, 4 2582*c0909341SAndroid Build Coastguard Worker packssdw m%1, m%2 2583*c0909341SAndroid Build Coastguard Worker%endmacro 2584*c0909341SAndroid Build Coastguard Worker PREP_6TAP_H 0, 1, 2, 3, 4 2585*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 2586*c0909341SAndroid Build Coastguard Worker add tmpq, 32 2587*c0909341SAndroid Build Coastguard Worker sub hd, 2 2588*c0909341SAndroid Build Coastguard Worker jg .h_w8 2589*c0909341SAndroid Build Coastguard Worker RET 2590*c0909341SAndroid Build Coastguard Worker.h_w16: 2591*c0909341SAndroid Build Coastguard Worker add wd, wd 2592*c0909341SAndroid Build Coastguard Worker.h_w16_loop0: 2593*c0909341SAndroid Build Coastguard Worker mov r6d, wd 2594*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 2595*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6-32] 2596*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6-24] 2597*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6-16] 2598*c0909341SAndroid Build Coastguard Worker PREP_6TAP_H 0, 1, 2, 3, 4 2599*c0909341SAndroid Build Coastguard Worker mova [tmpq+r6-32], m0 2600*c0909341SAndroid Build Coastguard Worker sub r6d, 32 2601*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 2602*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2603*c0909341SAndroid Build Coastguard Worker add tmpq, wq 2604*c0909341SAndroid Build Coastguard Worker dec hd 2605*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop0 2606*c0909341SAndroid Build Coastguard Worker RET 2607*c0909341SAndroid Build Coastguard Worker.v: 2608*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2609*c0909341SAndroid Build Coastguard Worker shr myd, 16 2610*c0909341SAndroid Build Coastguard Worker cmp hd, 4 2611*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 2612*c0909341SAndroid Build Coastguard Worker vpbroadcastq 
m0, [base+subpel_filters+1+myq*8] 2613*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 9, 12 2614*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [prep_8tap_1d_rnd] 2615*c0909341SAndroid Build Coastguard Worker mov r6, ssq 2616*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2617*c0909341SAndroid Build Coastguard Worker neg r6 2618*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2619*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 2620*c0909341SAndroid Build Coastguard Worker jnz .v_12bpc 2621*c0909341SAndroid Build Coastguard Worker psllw m0, 2 2622*c0909341SAndroid Build Coastguard Worker.v_12bpc: 2623*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q0000 2624*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q1111 2625*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q2222 2626*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2627*c0909341SAndroid Build Coastguard Worker jg .v_w8 2628*c0909341SAndroid Build Coastguard Worker.v_w4: 2629*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+r6 *2] 2630*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+r6 *1] 2631*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 2632*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 2633*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2634*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 2635*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0x30 2636*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0x30 2637*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; 01 12 2638*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0x30 2639*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 2640*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 ; 23 34 2641*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 2642*c0909341SAndroid Build Coastguard Worker 
vpbroadcastq m3, [srcq+ssq*1] 2643*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2644*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6, m1 ; a0 b0 2645*c0909341SAndroid Build Coastguard Worker mova m1, m2 2646*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 ; a1 b1 2647*c0909341SAndroid Build Coastguard Worker paddd m4, m2 2648*c0909341SAndroid Build Coastguard Worker vpblendd m2, m0, m3, 0x30 2649*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 2650*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0x30 2651*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; 45 56 2652*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m8, m2 ; a2 b2 2653*c0909341SAndroid Build Coastguard Worker paddd m4, m5 2654*c0909341SAndroid Build Coastguard Worker paddd m4, m3 2655*c0909341SAndroid Build Coastguard Worker psrad m4, 4 2656*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m4, 1 2657*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm3 2658*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm4 2659*c0909341SAndroid Build Coastguard Worker add tmpq, 16 2660*c0909341SAndroid Build Coastguard Worker sub hd, 2 2661*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 2662*c0909341SAndroid Build Coastguard Worker RET 2663*c0909341SAndroid Build Coastguard Worker.v_w8: 2664*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 12 2665*c0909341SAndroid Build Coastguard Worker%if WIN64 2666*c0909341SAndroid Build Coastguard Worker push r8 2667*c0909341SAndroid Build Coastguard Worker%endif 2668*c0909341SAndroid Build Coastguard Worker mov r8d, wd 2669*c0909341SAndroid Build Coastguard Worker shl wd, 5 2670*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 2671*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: 2672*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [srcq+r6 *2] 2673*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+r6 *1] 
2674*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+ssq*2] 2675*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 2676*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [srcq+ssq*1] 2677*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 2678*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r5+ssq*0] 2679*c0909341SAndroid Build Coastguard Worker shufpd m3, m0, 0x0c 2680*c0909341SAndroid Build Coastguard Worker shufpd m4, m1, 0x0c 2681*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 ; 01 2682*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 ; 23 2683*c0909341SAndroid Build Coastguard Worker shufpd m0, m2, 0x0c 2684*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 ; 12 2685*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 34 2686*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 2687*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [r5+ssq*1] 2688*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m6, m1 ; a0 2689*c0909341SAndroid Build Coastguard Worker lea r5, [r5+ssq*2] 2690*c0909341SAndroid Build Coastguard Worker pmaddwd m11, m6, m2 ; b0 2691*c0909341SAndroid Build Coastguard Worker mova m1, m3 2692*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 ; a1 2693*c0909341SAndroid Build Coastguard Worker mova m2, m4 2694*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 ; b1 2695*c0909341SAndroid Build Coastguard Worker paddd m10, m5 2696*c0909341SAndroid Build Coastguard Worker paddd m11, m5 2697*c0909341SAndroid Build Coastguard Worker paddd m10, m3 2698*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r5+ssq*0] 2699*c0909341SAndroid Build Coastguard Worker paddd m11, m4 2700*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, m9, 0x0d 2701*c0909341SAndroid Build Coastguard Worker shufpd m0, m9, m3, 0x0c 2702*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m0 ; 45 2703*c0909341SAndroid Build Coastguard Worker 
punpckhwd m4, m0 ; 56 2704*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m8, m3 ; a2 2705*c0909341SAndroid Build Coastguard Worker paddd m10, m9 2706*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m8, m4 ; b2 2707*c0909341SAndroid Build Coastguard Worker paddd m11, m9 2708*c0909341SAndroid Build Coastguard Worker psrad m10, 4 2709*c0909341SAndroid Build Coastguard Worker psrad m11, 4 2710*c0909341SAndroid Build Coastguard Worker packssdw m10, m11 2711*c0909341SAndroid Build Coastguard Worker vpermq m10, m10, q3120 2712*c0909341SAndroid Build Coastguard Worker mova [r7+r8*0], xm10 2713*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+r8*2], m10, 1 2714*c0909341SAndroid Build Coastguard Worker lea r7, [r7+r8*4] 2715*c0909341SAndroid Build Coastguard Worker sub hd, 2 2716*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 2717*c0909341SAndroid Build Coastguard Worker add srcq, 16 2718*c0909341SAndroid Build Coastguard Worker add tmpq, 16 2719*c0909341SAndroid Build Coastguard Worker movzx hd, wb 2720*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 2721*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop0 2722*c0909341SAndroid Build Coastguard Worker%if WIN64 2723*c0909341SAndroid Build Coastguard Worker pop r8 2724*c0909341SAndroid Build Coastguard Worker%endif 2725*c0909341SAndroid Build Coastguard Worker RET 2726*c0909341SAndroid Build Coastguard Worker.hv: 2727*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 13, 15 2728*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [prep_8tap_2d_rnd] 2729*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufA] 2730*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2731*c0909341SAndroid Build Coastguard Worker jg .hv_w8 2732*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2733*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+subpel_filters+mxq*8+2] 2734*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2735*c0909341SAndroid 
Build Coastguard Worker shr myd, 16 2736*c0909341SAndroid Build Coastguard Worker cmp hd, 4 2737*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 2738*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [base+subpel_filters+1+myq*8] 2739*c0909341SAndroid Build Coastguard Worker mov r6, ssq 2740*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2741*c0909341SAndroid Build Coastguard Worker pxor m6, m6 2742*c0909341SAndroid Build Coastguard Worker neg r6 2743*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m0 2744*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 2745*c0909341SAndroid Build Coastguard Worker psraw m6, 4 2746*c0909341SAndroid Build Coastguard Worker psraw m1, 8 2747*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 2748*c0909341SAndroid Build Coastguard Worker jz .hv_w4_10bit 2749*c0909341SAndroid Build Coastguard Worker psraw m6, 2 2750*c0909341SAndroid Build Coastguard Worker.hv_w4_10bit: 2751*c0909341SAndroid Build Coastguard Worker pshufd m10, m1, q0000 2752*c0909341SAndroid Build Coastguard Worker pshufd m11, m1, q1111 2753*c0909341SAndroid Build Coastguard Worker pshufd m12, m1, q2222 2754*c0909341SAndroid Build Coastguard Worker.hv_w4: 2755*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+r6 *2] 2756*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 2757*c0909341SAndroid Build Coastguard Worker pshufd m5, m6, q0000 2758*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [base+subpel_h_shufB] 2759*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 2760*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q1111 2761*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 2762*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2763*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*0] ; 4 2764*c0909341SAndroid Build Coastguard Worker pshufb m1, m2, m8 2765*c0909341SAndroid Build Coastguard Worker 
pmaddwd m1, m5 2766*c0909341SAndroid Build Coastguard Worker pshufb m2, m9 2767*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m6 2768*c0909341SAndroid Build Coastguard Worker pshufb m4, m0, m8 2769*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m5 2770*c0909341SAndroid Build Coastguard Worker pshufb m0, m9 2771*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 2772*c0909341SAndroid Build Coastguard Worker paddd m2, m1 2773*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm3, xm8 2774*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm5 2775*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm9 2776*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm6 2777*c0909341SAndroid Build Coastguard Worker paddd m0, m4 2778*c0909341SAndroid Build Coastguard Worker paddd m2, m7 2779*c0909341SAndroid Build Coastguard Worker paddd xm1, xm7 2780*c0909341SAndroid Build Coastguard Worker paddd m0, m7 2781*c0909341SAndroid Build Coastguard Worker paddd xm3, xm1 2782*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m2, m0, xm3 2783*c0909341SAndroid Build Coastguard Worker packssdw m2, m0 ; 0 2 1 3 2784*c0909341SAndroid Build Coastguard Worker packssdw xm0, xm3 ; 2 4 2785*c0909341SAndroid Build Coastguard Worker vperm2i128 m0, m2, 0x03 2786*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m0 ; 01 12 2787*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0 ; 23 34 2788*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 2789*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 2790*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2791*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*0], 1 2792*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m10, m1 ; a0 b0 2793*c0909341SAndroid Build Coastguard Worker mova m1, m2 2794*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m11 ; a1 b1 2795*c0909341SAndroid Build Coastguard Worker paddd m4, m2 2796*c0909341SAndroid Build 
Coastguard Worker pshufb m2, m3, m8 2797*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5 2798*c0909341SAndroid Build Coastguard Worker pshufb m3, m9 2799*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 2800*c0909341SAndroid Build Coastguard Worker paddd m2, m7 2801*c0909341SAndroid Build Coastguard Worker paddd m3, m2 2802*c0909341SAndroid Build Coastguard Worker psrad m3, 6 2803*c0909341SAndroid Build Coastguard Worker packssdw m3, m3 ; 5 5 6 6 2804*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m0, m3, 0x21 2805*c0909341SAndroid Build Coastguard Worker mova m0, m3 2806*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 ; 45 56 2807*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m2 ; a2 b2 2808*c0909341SAndroid Build Coastguard Worker paddd m4, m7 2809*c0909341SAndroid Build Coastguard Worker paddd m4, m3 2810*c0909341SAndroid Build Coastguard Worker psrad m4, 6 2811*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m4, 1 2812*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm3 2813*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm4 2814*c0909341SAndroid Build Coastguard Worker add tmpq, 16 2815*c0909341SAndroid Build Coastguard Worker sub hd, 2 2816*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 2817*c0909341SAndroid Build Coastguard Worker RET 2818*c0909341SAndroid Build Coastguard Worker.hv_w8: 2819*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2820*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [base+subpel_filters+1+mxq*8] 2821*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2822*c0909341SAndroid Build Coastguard Worker shr myd, 16 2823*c0909341SAndroid Build Coastguard Worker cmp hd, 4 2824*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 2825*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [base+subpel_filters+1+myq*8] 2826*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 15 2827*c0909341SAndroid Build Coastguard Worker%if WIN64 
2828*c0909341SAndroid Build Coastguard Worker PUSH r8 2829*c0909341SAndroid Build Coastguard Worker%endif 2830*c0909341SAndroid Build Coastguard Worker mov r8d, wd 2831*c0909341SAndroid Build Coastguard Worker shl wd, 5 2832*c0909341SAndroid Build Coastguard Worker mov r6, ssq 2833*c0909341SAndroid Build Coastguard Worker sub srcq, 4 2834*c0909341SAndroid Build Coastguard Worker neg r6 2835*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 2836*c0909341SAndroid Build Coastguard Worker pxor m0, m0 2837*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 2838*c0909341SAndroid Build Coastguard Worker psraw m0, 4 2839*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 2840*c0909341SAndroid Build Coastguard Worker jz .hv_w8_10bit 2841*c0909341SAndroid Build Coastguard Worker psraw m0, 2 2842*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit: 2843*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q0000 2844*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q1111 2845*c0909341SAndroid Build Coastguard Worker mova [v_mul], xm1 2846*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q2222 2847*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 2848*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0+ 0] 2849*c0909341SAndroid Build Coastguard Worker vinserti128 m3, m0, [srcq+r6*2+ 0], 0 2850*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+ssq*2] 2851*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [srcq+ssq*0+16] 2852*c0909341SAndroid Build Coastguard Worker vinserti128 m1, m2, [srcq+r6*2+16], 0 2853*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 2854*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r5 +ssq*0+ 0], 1 2855*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [r5 +ssq*0+16], 1 2856*c0909341SAndroid Build Coastguard Worker shufpd m4, m3, m1, 0x05 2857*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] 
2858*c0909341SAndroid Build Coastguard Worker pshufb m%1, m8 ; 01 12 23 34 2859*c0909341SAndroid Build Coastguard Worker pshufb m%2, m8 ; 45 56 67 78 2860*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m10, m%1 ; a0 2861*c0909341SAndroid Build Coastguard Worker pshufb m%3, m8 ; 89 9a ab bc 2862*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m12, m%2 ; a2 2863*c0909341SAndroid Build Coastguard Worker shufpd m%1, m%2, 0x05 ; 23 34 45 56 2864*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 ; a0+a2 2865*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m10, m%2 ; b0 2866*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 67 78 89 9a 2867*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m12 ; b2 2868*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m11 ; a1 2869*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m11 ; b1 2870*c0909341SAndroid Build Coastguard Worker paddd m%3, m%5 ; b0+b2 2871*c0909341SAndroid Build Coastguard Worker paddd m%4, m7 2872*c0909341SAndroid Build Coastguard Worker paddd m%3, m7 2873*c0909341SAndroid Build Coastguard Worker paddd m%1, m%4 2874*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 2875*c0909341SAndroid Build Coastguard Worker psrad m%1, 6 2876*c0909341SAndroid Build Coastguard Worker psrad m%2, 6 2877*c0909341SAndroid Build Coastguard Worker packssdw m%1, m%2 2878*c0909341SAndroid Build Coastguard Worker%endmacro 2879*c0909341SAndroid Build Coastguard Worker PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 2880*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 *1+ 0] 2881*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+ssq*1+ 0], 1 2882*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 2883*c0909341SAndroid Build Coastguard Worker PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 2884*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+r6 *1+16] 2885*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+16], 1 2886*c0909341SAndroid Build 
Coastguard Worker shufpd m1, m4, m2, 0x05 2887*c0909341SAndroid Build Coastguard Worker PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 2888*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, q3120 2889*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q3120 2890*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 2891*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 ; 01 2892*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 ; 23 2893*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 ; 12 2894*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 34 2895*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 2896*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [v_mul+4*0] 2897*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [v_mul+4*1] 2898*c0909341SAndroid Build Coastguard Worker movu xm5, [r5+ssq*1+ 0] 2899*c0909341SAndroid Build Coastguard Worker movu xm6, [r5+ssq*1+16] 2900*c0909341SAndroid Build Coastguard Worker lea r5, [r5+ssq*2] 2901*c0909341SAndroid Build Coastguard Worker pmaddwd m13, m14, m1 ; a0 2902*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m2 ; b0 2903*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r5+ssq*0+ 0], 1 2904*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r5+ssq*0+16], 1 2905*c0909341SAndroid Build Coastguard Worker mova m1, m3 2906*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9 ; a1 2907*c0909341SAndroid Build Coastguard Worker mova m2, m4 2908*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 ; b1 2909*c0909341SAndroid Build Coastguard Worker paddd m13, m3 2910*c0909341SAndroid Build Coastguard Worker shufpd m3, m5, m6, 0x05 2911*c0909341SAndroid Build Coastguard Worker paddd m14, m4 2912*c0909341SAndroid Build Coastguard Worker PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6 2913*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [v_mul+4*2] 2914*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 2915*c0909341SAndroid 
Build Coastguard Worker shufpd m4, m0, m5, 0x05 2916*c0909341SAndroid Build Coastguard Worker mova m0, m5 2917*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 ; 45 2918*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 56 2919*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m6, m3 ; a2 2920*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m4 ; b2 2921*c0909341SAndroid Build Coastguard Worker paddd m13, m7 2922*c0909341SAndroid Build Coastguard Worker paddd m14, m7 2923*c0909341SAndroid Build Coastguard Worker paddd m5, m13 2924*c0909341SAndroid Build Coastguard Worker paddd m6, m14 2925*c0909341SAndroid Build Coastguard Worker psrad m5, 6 2926*c0909341SAndroid Build Coastguard Worker psrad m6, 6 2927*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 2928*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 2929*c0909341SAndroid Build Coastguard Worker mova [r7+r8*0], xm5 2930*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+r8*2], m5, 1 2931*c0909341SAndroid Build Coastguard Worker lea r7, [r7+r8*4] 2932*c0909341SAndroid Build Coastguard Worker sub hd, 2 2933*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 2934*c0909341SAndroid Build Coastguard Worker add srcq, 16 2935*c0909341SAndroid Build Coastguard Worker add tmpq, 16 2936*c0909341SAndroid Build Coastguard Worker movzx hd, wb 2937*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 2938*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop0 2939*c0909341SAndroid Build Coastguard Worker%if WIN64 2940*c0909341SAndroid Build Coastguard Worker POP r8 2941*c0909341SAndroid Build Coastguard Worker%endif 2942*c0909341SAndroid Build Coastguard Worker RET 2943*c0909341SAndroid Build Coastguard Worker 2944*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc 2945*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc 2946*c0909341SAndroid Build Coastguard 
WorkerPREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc 2947*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc 2948*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp, SHARP, SHARP 2949*c0909341SAndroid Build Coastguard Worker 2950*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my 2951*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx2 2952*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 2953*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 2954*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 2955*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 2956*c0909341SAndroid Build Coastguard Worker lea r7, [prep_avx2] 2957*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 2958*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 2959*c0909341SAndroid Build Coastguard Worker jnz .h 2960*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2961*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep 2962*c0909341SAndroid Build Coastguard Worker.v: 2963*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2964*c0909341SAndroid Build Coastguard Worker shr myd, 16 2965*c0909341SAndroid Build Coastguard Worker cmp hd, 4 2966*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 2967*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+myq*8] 2968*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 15 2969*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [prep_8tap_1d_rnd] 2970*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 2971*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2972*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2973*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2974*c0909341SAndroid Build 
Coastguard Worker test dword r7m, 0x800 2975*c0909341SAndroid Build Coastguard Worker jnz .v_12bpc 2976*c0909341SAndroid Build Coastguard Worker psllw m0, 2 2977*c0909341SAndroid Build Coastguard Worker.v_12bpc: 2978*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q0000 2979*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q1111 2980*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q2222 2981*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q3333 2982*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2983*c0909341SAndroid Build Coastguard Worker jg .v_w8 2984*c0909341SAndroid Build Coastguard Worker.v_w4: 2985*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*0] 2986*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*1] 2987*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+strideq*2] 2988*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+r6 ] 2989*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 2990*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+strideq*0] 2991*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+strideq*1] 2992*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x30 2993*c0909341SAndroid Build Coastguard Worker vpblendd m0, m2, 0x30 2994*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0 ; 01 12 2995*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*2] 2996*c0909341SAndroid Build Coastguard Worker add srcq, r6 2997*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0x30 2998*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, 0x30 2999*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 ; 23 34 3000*c0909341SAndroid Build Coastguard Worker vpblendd m3, m5, 0x30 3001*c0909341SAndroid Build Coastguard Worker vpblendd m5, m0, 0x30 3002*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m5 ; 45 56 3003*c0909341SAndroid Build Coastguard 
Worker.v_w4_loop: 3004*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*0] 3005*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m8, m1 ; a0 b0 3006*c0909341SAndroid Build Coastguard Worker mova m1, m2 3007*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m9 ; a1 b1 3008*c0909341SAndroid Build Coastguard Worker paddd m5, m7 3009*c0909341SAndroid Build Coastguard Worker paddd m5, m2 3010*c0909341SAndroid Build Coastguard Worker mova m2, m3 3011*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10 ; a2 b2 3012*c0909341SAndroid Build Coastguard Worker paddd m5, m3 3013*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, m4, 0x30 3014*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*1] 3015*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 3016*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 3017*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 ; 67 78 3018*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m11, m3 ; a3 b3 3019*c0909341SAndroid Build Coastguard Worker paddd m5, m4 3020*c0909341SAndroid Build Coastguard Worker psrad m5, 4 3021*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m5, 1 3022*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm4 3023*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm5 3024*c0909341SAndroid Build Coastguard Worker add tmpq, 16 3025*c0909341SAndroid Build Coastguard Worker sub hd, 2 3026*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 3027*c0909341SAndroid Build Coastguard Worker RET 3028*c0909341SAndroid Build Coastguard Worker.v_w8: 3029*c0909341SAndroid Build Coastguard Worker%if WIN64 3030*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 15 3031*c0909341SAndroid Build Coastguard Worker push r8 3032*c0909341SAndroid Build Coastguard Worker%endif 3033*c0909341SAndroid Build Coastguard Worker mov r8d, wd 3034*c0909341SAndroid Build Coastguard Worker shl wd, 5 3035*c0909341SAndroid Build 
Coastguard Worker lea wd, [hq+wq-256] 3036*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: 3037*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+strideq*0] 3038*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [srcq+strideq*1] 3039*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+strideq*4] 3040*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+r6 ] 3041*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [srcq+strideq*2] 3042*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3043*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [r5+strideq*0] 3044*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r5+strideq*1] 3045*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r5+strideq*2] 3046*c0909341SAndroid Build Coastguard Worker add r5, r6 3047*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, 0x0c 3048*c0909341SAndroid Build Coastguard Worker shufpd m5, m1, 0x0c 3049*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 ; 01 3050*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 3051*c0909341SAndroid Build Coastguard Worker shufpd m6, m2, 0x0c 3052*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5, m6 ; 12 3053*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 3054*c0909341SAndroid Build Coastguard Worker shufpd m0, m3, 0x0c 3055*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m6, m0 ; 23 3056*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 ; 56 3057*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 3058*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m14, [r5+strideq*0] 3059*c0909341SAndroid Build Coastguard Worker pmaddwd m12, m8, m1 ; a0 3060*c0909341SAndroid Build Coastguard Worker pmaddwd m13, m8, m2 ; b0 3061*c0909341SAndroid Build Coastguard Worker mova m1, m3 3062*c0909341SAndroid Build Coastguard Worker mova m2, m4 3063*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9 ; a1 
3064*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 ; b1 3065*c0909341SAndroid Build Coastguard Worker paddd m12, m7 3066*c0909341SAndroid Build Coastguard Worker paddd m13, m7 3067*c0909341SAndroid Build Coastguard Worker paddd m12, m3 3068*c0909341SAndroid Build Coastguard Worker paddd m13, m4 3069*c0909341SAndroid Build Coastguard Worker mova m3, m5 3070*c0909341SAndroid Build Coastguard Worker mova m4, m6 3071*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m10 ; a2 3072*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m10 ; b2 3073*c0909341SAndroid Build Coastguard Worker paddd m12, m5 3074*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [r5+strideq*1] 3075*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3076*c0909341SAndroid Build Coastguard Worker paddd m13, m6 3077*c0909341SAndroid Build Coastguard Worker shufpd m6, m0, m14, 0x0d 3078*c0909341SAndroid Build Coastguard Worker shufpd m0, m14, m5, 0x0c 3079*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m0 ; 67 3080*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 ; 78 3081*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m11, m5 ; a3 3082*c0909341SAndroid Build Coastguard Worker paddd m12, m14 3083*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m11, m6 ; b3 3084*c0909341SAndroid Build Coastguard Worker paddd m13, m14 3085*c0909341SAndroid Build Coastguard Worker psrad m12, 4 3086*c0909341SAndroid Build Coastguard Worker psrad m13, 4 3087*c0909341SAndroid Build Coastguard Worker packssdw m12, m13 3088*c0909341SAndroid Build Coastguard Worker vpermq m12, m12, q3120 3089*c0909341SAndroid Build Coastguard Worker mova [r7+r8*0], xm12 3090*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+r8*2], m12, 1 3091*c0909341SAndroid Build Coastguard Worker lea r7, [r7+r8*4] 3092*c0909341SAndroid Build Coastguard Worker sub hd, 2 3093*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 3094*c0909341SAndroid Build Coastguard Worker add 
srcq, 16 3095*c0909341SAndroid Build Coastguard Worker add tmpq, 16 3096*c0909341SAndroid Build Coastguard Worker movzx hd, wb 3097*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 3098*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop0 3099*c0909341SAndroid Build Coastguard Worker%if WIN64 3100*c0909341SAndroid Build Coastguard Worker pop r8 3101*c0909341SAndroid Build Coastguard Worker%endif 3102*c0909341SAndroid Build Coastguard Worker RET 3103*c0909341SAndroid Build Coastguard Worker.h: 3104*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3105*c0909341SAndroid Build Coastguard Worker jnz .hv 3106*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) 3107*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3108*c0909341SAndroid Build Coastguard Worker je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4 3109*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3110*c0909341SAndroid Build Coastguard Worker sub srcq, 6 3111*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+mxq*8] 3112*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12 3113*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufA] 3114*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufB] 3115*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 3116*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 3117*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 3118*c0909341SAndroid Build Coastguard Worker jnz .h_12bpc 3119*c0909341SAndroid Build Coastguard Worker psllw m0, 2 3120*c0909341SAndroid Build Coastguard Worker.h_12bpc: 3121*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q0000 3122*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q1111 3123*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q2222 3124*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q3333 3125*c0909341SAndroid Build Coastguard 
Worker cmp wd, 8 3126*c0909341SAndroid Build Coastguard Worker jg .h_w16 3127*c0909341SAndroid Build Coastguard Worker.h_w8: 3128*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] 3129*c0909341SAndroid Build Coastguard Worker pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 3130*c0909341SAndroid Build Coastguard Worker pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 3131*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%4 ; abcd1 3132*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 ; abcd0 3133*c0909341SAndroid Build Coastguard Worker pshufb m%2, m7 ; 6 7 7 8 8 9 9 a 3134*c0909341SAndroid Build Coastguard Worker shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 3135*c0909341SAndroid Build Coastguard Worker paddd m%5, m5 3136*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 3137*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m11, m%2 ; abcd3 3138*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 3139*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m10, m%4 ; abcd2 3140*c0909341SAndroid Build Coastguard Worker pshufb m%3, m7 ; a b b c c d d e 3141*c0909341SAndroid Build Coastguard Worker pmaddwd m%4, m8 ; efgh0 3142*c0909341SAndroid Build Coastguard Worker paddd m%1, m%5 3143*c0909341SAndroid Build Coastguard Worker pmaddwd m%5, m9, m%2 ; efgh1 3144*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c 3145*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m11 ; efgh3 3146*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m10 ; efgh2 3147*c0909341SAndroid Build Coastguard Worker paddd m%4, m5 3148*c0909341SAndroid Build Coastguard Worker paddd m%4, m%5 3149*c0909341SAndroid Build Coastguard Worker paddd m%3, m%4 3150*c0909341SAndroid Build Coastguard Worker paddd m%2, m%3 3151*c0909341SAndroid Build Coastguard Worker psrad m%1, 4 3152*c0909341SAndroid Build Coastguard Worker psrad m%2, 4 3153*c0909341SAndroid Build Coastguard Worker packssdw m%1, m%2 3154*c0909341SAndroid 
Build Coastguard Worker%endmacro 3155*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+ 0] 3156*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*1+ 0], 1 3157*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*0+16] 3158*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*1+16], 1 3159*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 3160*c0909341SAndroid Build Coastguard Worker shufpd m1, m0, m2, 0x05 3161*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 0, 1, 2, 3, 4 3162*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 3163*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3164*c0909341SAndroid Build Coastguard Worker sub hd, 2 3165*c0909341SAndroid Build Coastguard Worker jg .h_w8 3166*c0909341SAndroid Build Coastguard Worker RET 3167*c0909341SAndroid Build Coastguard Worker.h_w16: 3168*c0909341SAndroid Build Coastguard Worker add wd, wd 3169*c0909341SAndroid Build Coastguard Worker.h_w16_loop0: 3170*c0909341SAndroid Build Coastguard Worker mov r6d, wd 3171*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 3172*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6-32] 3173*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6-24] 3174*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6-16] 3175*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 0, 1, 2, 3, 4 3176*c0909341SAndroid Build Coastguard Worker mova [tmpq+r6-32], m0 3177*c0909341SAndroid Build Coastguard Worker sub r6d, 32 3178*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 3179*c0909341SAndroid Build Coastguard Worker add srcq, strideq 3180*c0909341SAndroid Build Coastguard Worker add tmpq, wq 3181*c0909341SAndroid Build Coastguard Worker dec hd 3182*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop0 3183*c0909341SAndroid Build Coastguard Worker RET 3184*c0909341SAndroid Build Coastguard Worker.hv: 3185*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 
16 3186*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [prep_8tap_2d_rnd] 3187*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3188*c0909341SAndroid Build Coastguard Worker jg .hv_w8 3189*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 3190*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+subpel_filters+mxq*8+2] 3191*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3192*c0909341SAndroid Build Coastguard Worker shr myd, 16 3193*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3194*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3195*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [base+subpel_filters+myq*8] 3196*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 3197*c0909341SAndroid Build Coastguard Worker sub srcq, 2 3198*c0909341SAndroid Build Coastguard Worker pxor m7, m7 3199*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3200*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m0 3201*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 3202*c0909341SAndroid Build Coastguard Worker psraw m7, 4 3203*c0909341SAndroid Build Coastguard Worker psraw m1, 8 3204*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 3205*c0909341SAndroid Build Coastguard Worker jz .hv_w4_10bit 3206*c0909341SAndroid Build Coastguard Worker psraw m7, 2 3207*c0909341SAndroid Build Coastguard Worker.hv_w4_10bit: 3208*c0909341SAndroid Build Coastguard Worker pshufd m11, m1, q0000 3209*c0909341SAndroid Build Coastguard Worker pshufd m12, m1, q1111 3210*c0909341SAndroid Build Coastguard Worker pshufd m13, m1, q2222 3211*c0909341SAndroid Build Coastguard Worker pshufd m14, m1, q3333 3212*c0909341SAndroid Build Coastguard Worker.hv_w4: 3213*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufA] 3214*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [subpel_h_shufB] 3215*c0909341SAndroid Build Coastguard Worker pshufd m8, m7, q1111 3216*c0909341SAndroid Build 
Coastguard Worker pshufd m7, m7, q0000 3217*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0] 3218*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 3219*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+r6 ] 3220*c0909341SAndroid Build Coastguard Worker vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 3221*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3222*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 3223*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+strideq*1] 3224*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 3225*c0909341SAndroid Build Coastguard Worker add srcq, r6 3226*c0909341SAndroid Build Coastguard Worker pshufb m4, m1, m9 3227*c0909341SAndroid Build Coastguard Worker pshufb m1, m10 3228*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 3229*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8 3230*c0909341SAndroid Build Coastguard Worker pshufb m5, m2, m9 3231*c0909341SAndroid Build Coastguard Worker pshufb m2, m10 3232*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m7 3233*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 3234*c0909341SAndroid Build Coastguard Worker paddd m4, m15 3235*c0909341SAndroid Build Coastguard Worker paddd m1, m4 3236*c0909341SAndroid Build Coastguard Worker pshufb m4, m0, m9 3237*c0909341SAndroid Build Coastguard Worker pshufb m0, m10 3238*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 3239*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m8 3240*c0909341SAndroid Build Coastguard Worker paddd m5, m15 3241*c0909341SAndroid Build Coastguard Worker paddd m2, m5 3242*c0909341SAndroid Build Coastguard Worker pshufb m5, m3, m9 3243*c0909341SAndroid Build Coastguard Worker pshufb m3, m10 3244*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m7 3245*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m8 3246*c0909341SAndroid 
Build Coastguard Worker paddd m4, m15 3247*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3248*c0909341SAndroid Build Coastguard Worker paddd m5, m15 3249*c0909341SAndroid Build Coastguard Worker paddd m5, m3 3250*c0909341SAndroid Build Coastguard Worker vperm2i128 m0, m1, m2, 0x21 3251*c0909341SAndroid Build Coastguard Worker psrld m1, 6 3252*c0909341SAndroid Build Coastguard Worker psrld m2, 6 3253*c0909341SAndroid Build Coastguard Worker vperm2i128 m3, m4, m5, 0x21 3254*c0909341SAndroid Build Coastguard Worker pslld m4, 10 3255*c0909341SAndroid Build Coastguard Worker pslld m5, 10 3256*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa ; 23 34 3257*c0909341SAndroid Build Coastguard Worker pslld m0, 10 3258*c0909341SAndroid Build Coastguard Worker pblendw m1, m0, 0xaa ; 01 12 3259*c0909341SAndroid Build Coastguard Worker psrld m3, 6 3260*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa ; 45 56 3261*c0909341SAndroid Build Coastguard Worker psrad m0, m5, 16 3262*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 3263*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+strideq*0] 3264*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+strideq*1], 1 3265*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 3266*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m11, m1 ; a0 b0 3267*c0909341SAndroid Build Coastguard Worker mova m1, m2 3268*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m12 ; a1 b1 3269*c0909341SAndroid Build Coastguard Worker paddd m5, m15 3270*c0909341SAndroid Build Coastguard Worker paddd m5, m2 3271*c0909341SAndroid Build Coastguard Worker mova m2, m3 3272*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 ; a2 b2 3273*c0909341SAndroid Build Coastguard Worker paddd m5, m3 3274*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m9 3275*c0909341SAndroid Build Coastguard Worker pshufb m4, m10 3276*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 
3277*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m8 3278*c0909341SAndroid Build Coastguard Worker paddd m3, m15 3279*c0909341SAndroid Build Coastguard Worker paddd m4, m3 3280*c0909341SAndroid Build Coastguard Worker psrad m4, 6 3281*c0909341SAndroid Build Coastguard Worker packssdw m0, m4 ; _ 7 6 8 3282*c0909341SAndroid Build Coastguard Worker vpermq m3, m0, q1122 ; _ 6 _ 7 3283*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 ; 67 78 3284*c0909341SAndroid Build Coastguard Worker mova m0, m4 3285*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m14, m3 ; a3 b3 3286*c0909341SAndroid Build Coastguard Worker paddd m4, m5 3287*c0909341SAndroid Build Coastguard Worker psrad m4, 6 3288*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 3289*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm5 3290*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm4 3291*c0909341SAndroid Build Coastguard Worker add tmpq, 16 3292*c0909341SAndroid Build Coastguard Worker sub hd, 2 3293*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 3294*c0909341SAndroid Build Coastguard Worker RET 3295*c0909341SAndroid Build Coastguard Worker.hv_w8: 3296*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3297*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [base+subpel_filters+mxq*8] 3298*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3299*c0909341SAndroid Build Coastguard Worker shr myd, 16 3300*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3301*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3302*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [base+subpel_filters+myq*8] 3303*c0909341SAndroid Build Coastguard Worker%if WIN64 3304*c0909341SAndroid Build Coastguard Worker PUSH r8 3305*c0909341SAndroid Build Coastguard Worker%endif 3306*c0909341SAndroid Build Coastguard Worker mov r8d, wd 3307*c0909341SAndroid Build Coastguard Worker shl wd, 5 3308*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 
3309*c0909341SAndroid Build Coastguard Worker sub srcq, 6 3310*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3311*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq-256] 3312*c0909341SAndroid Build Coastguard Worker pxor m0, m0 3313*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 3314*c0909341SAndroid Build Coastguard Worker psraw m0, 4 3315*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 3316*c0909341SAndroid Build Coastguard Worker jz .hv_w8_10bit 3317*c0909341SAndroid Build Coastguard Worker psraw m0, 2 3318*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit: 3319*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q0000 3320*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q1111 3321*c0909341SAndroid Build Coastguard Worker mova [v_mul], xm1 3322*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q2222 3323*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q3333 3324*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 3325*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 3326*c0909341SAndroid Build Coastguard Worker pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 3327*c0909341SAndroid Build Coastguard Worker pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 3328*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m2 3329*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m11 3330*c0909341SAndroid Build Coastguard Worker pshufb m%2, m9 ; 6 7 7 8 8 9 9 a 3331*c0909341SAndroid Build Coastguard Worker shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 3332*c0909341SAndroid Build Coastguard Worker paddd m3, m15 3333*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 3334*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14, m%2 3335*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 3336*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13, m2 3337*c0909341SAndroid Build Coastguard Worker pshufb m%3, m9 ; a b b c c d d e 3338*c0909341SAndroid Build Coastguard Worker 
pmaddwd m2, m11 3339*c0909341SAndroid Build Coastguard Worker paddd m%1, m3 3340*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m%2 3341*c0909341SAndroid Build Coastguard Worker shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c 3342*c0909341SAndroid Build Coastguard Worker pmaddwd m%3, m14 3343*c0909341SAndroid Build Coastguard Worker pmaddwd m%2, m13 3344*c0909341SAndroid Build Coastguard Worker paddd m2, m15 3345*c0909341SAndroid Build Coastguard Worker paddd m2, m3 3346*c0909341SAndroid Build Coastguard Worker paddd m2, m%3 3347*c0909341SAndroid Build Coastguard Worker paddd m2, m%2 3348*c0909341SAndroid Build Coastguard Worker psrad m%1, 6 3349*c0909341SAndroid Build Coastguard Worker psrad m2, 6 3350*c0909341SAndroid Build Coastguard Worker packssdw m%1, m2 3351*c0909341SAndroid Build Coastguard Worker%endmacro 3352*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 + 0] 3353*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufA] 3354*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+strideq*4] 3355*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+r6 + 8] 3356*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufB] 3357*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3358*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+r6 +16] 3359*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+strideq*0+ 0] 3360*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r5 +strideq*0+ 0], 1 3361*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0+16] 3362*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [r5 +strideq*0+16], 1 3363*c0909341SAndroid Build Coastguard Worker shufpd m7, m5, m1, 0x05 3364*c0909341SAndroid Build Coastguard Worker INIT_XMM avx2 3365*c0909341SAndroid Build Coastguard Worker PREP_8TAP_HV_H 4, 6, 0 ; 3 3366*c0909341SAndroid Build Coastguard Worker INIT_YMM avx2 3367*c0909341SAndroid Build Coastguard Worker PREP_8TAP_HV_H 5, 7, 1 ; 0 4 
3368*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*2+ 0] 3369*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+r6 *2+ 0], 1 3370*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*2+16] 3371*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+r6 *2+16], 1 3372*c0909341SAndroid Build Coastguard Worker shufpd m7, m0, m1, 0x05 3373*c0909341SAndroid Build Coastguard Worker PREP_8TAP_HV_H 0, 7, 1 ; 2 6 3374*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+strideq*1+ 0] 3375*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1+16] 3376*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r5 +strideq*1+ 0], 1 3377*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [r5 +strideq*1+16], 1 3378*c0909341SAndroid Build Coastguard Worker add r5, r6 3379*c0909341SAndroid Build Coastguard Worker shufpd m7, m6, m1, 0x05 3380*c0909341SAndroid Build Coastguard Worker PREP_8TAP_HV_H 6, 7, 1 ; 1 5 3381*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q1100 3382*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 3383*c0909341SAndroid Build Coastguard Worker vpermq m6, m6, q3120 3384*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 3385*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m7, m4 ; 23 3386*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 3387*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m5, m6 ; 01 3388*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 3389*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m6, m7 ; 12 3390*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 56 3391*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 3392*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [v_mul+4*0] 3393*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [v_mul+4*1] 3394*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [v_mul+4*2] 3395*c0909341SAndroid Build Coastguard Worker 
pmaddwd m8, m9, m1 ; a0 3396*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m2 ; b0 3397*c0909341SAndroid Build Coastguard Worker mova m1, m3 3398*c0909341SAndroid Build Coastguard Worker mova m2, m4 3399*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 ; a1 3400*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m7 ; b1 3401*c0909341SAndroid Build Coastguard Worker paddd m8, m15 3402*c0909341SAndroid Build Coastguard Worker paddd m9, m15 3403*c0909341SAndroid Build Coastguard Worker paddd m8, m3 3404*c0909341SAndroid Build Coastguard Worker paddd m9, m4 3405*c0909341SAndroid Build Coastguard Worker mova m3, m5 3406*c0909341SAndroid Build Coastguard Worker mova m4, m6 3407*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m10 ; a2 3408*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m10 ; b2 3409*c0909341SAndroid Build Coastguard Worker paddd m8, m5 3410*c0909341SAndroid Build Coastguard Worker paddd m9, m6 3411*c0909341SAndroid Build Coastguard Worker movu xm5, [r5+strideq*0] 3412*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r5+strideq*1], 1 3413*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufA] 3414*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [subpel_h_shufB] 3415*c0909341SAndroid Build Coastguard Worker movu xm6, [r5+strideq*0+16] 3416*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r5+strideq*1+16], 1 3417*c0909341SAndroid Build Coastguard Worker vextracti128 [r7], m0, 1 3418*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m7 ; 01 3419*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 ; 23 3420*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 3421*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m12 3422*c0909341SAndroid Build Coastguard Worker paddd m0, m15 3423*c0909341SAndroid Build Coastguard Worker paddd m0, m5 3424*c0909341SAndroid Build Coastguard Worker pshufb m5, m6, m7 ; 89 3425*c0909341SAndroid Build Coastguard Worker pshufb m6, m10 ; 
ab 3426*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m13 3427*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m14 3428*c0909341SAndroid Build Coastguard Worker paddd m5, m15 3429*c0909341SAndroid Build Coastguard Worker paddd m6, m5 3430*c0909341SAndroid Build Coastguard Worker movu xm5, [r5+strideq*0+8] 3431*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r5+strideq*1+8], 1 3432*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3433*c0909341SAndroid Build Coastguard Worker pshufb m7, m5, m7 3434*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 3435*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m13, m7 3436*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m11 3437*c0909341SAndroid Build Coastguard Worker paddd m0, m10 3438*c0909341SAndroid Build Coastguard Worker paddd m6, m7 3439*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m14, m5 3440*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m12 3441*c0909341SAndroid Build Coastguard Worker paddd m0, m7 3442*c0909341SAndroid Build Coastguard Worker paddd m5, m6 3443*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [r7] 3444*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [v_mul+4*3] 3445*c0909341SAndroid Build Coastguard Worker psrad m0, 6 3446*c0909341SAndroid Build Coastguard Worker psrad m5, 6 3447*c0909341SAndroid Build Coastguard Worker packssdw m0, m5 3448*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 ; 7 8 3449*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x04 ; 6 7 3450*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m7 ; 67 3451*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 78 3452*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m10, m5 ; a3 3453*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m6 ; b3 3454*c0909341SAndroid Build Coastguard Worker paddd m7, m8 3455*c0909341SAndroid Build Coastguard Worker paddd m9, m10 3456*c0909341SAndroid Build Coastguard 
; ---------------------------------------------------------------------------
; Closing part of the .hv_w8_loop body: finish the two accumulators, store two
; output rows and advance, first by rows and then by 16-byte column strips.
; ---------------------------------------------------------------------------
    psrad                m7, 6
    psrad                m9, 6
    packssdw             m7, m9
    vpermq               m7, m7, q3120
    mova          [r7+r8*0], xm7           ; low 128 bits  -> first output row
    vextracti128  [r7+r8*2], m7, 1         ; high 128 bits -> second output row
    lea                  r7, [r7+r8*4]     ; step the store pointer past both rows
    sub                  hd, 2             ; two rows are produced per iteration
    jg .hv_w8_loop
    add                srcq, 16            ; next strip: 16 bytes further right
    add                tmpq, 16
    movzx                hd, wb            ; row counter is reloaded from the low byte of wd
    sub                  wd, 1<<8          ; the bits of wd above the low byte count strips
    jg .hv_w8_loop0
%if WIN64
    POP                  r8                ; pairs with the WIN64-only PUSH r8 at .hv_w8
%endif
    RET

; movifprep dst, src
; Emits "mov dst, src" only when isprep is non-zero; expands to nothing
; otherwise.
%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

; REMAP_REG a, b
; Rebinds the names r<a>, r<a>q and r<a>d so that they expand to what r<b>,
; r<b>q and r<b>d expand to at this point. Only these three forms are rebound.
%macro REMAP_REG 2
 %xdefine r%1 r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
%endmacro

; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; When isprep is non-zero: remembers the current meaning of r14 in r14_save,
; then rebinds r14 -> r13, r13 -> r12, ..., r1 -> r0. The walk goes downwards
; so every name is rebound before the name it is taken from changes.
; Expands to nothing when isprep is zero.
; NOTE(review): presumably this lets the prep variant reuse code written
; against put's register numbering (the prep cglobal line has one leading
; argument fewer than the put one) - confirm.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %xdefine r14_save r14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %endif
%endmacro

; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
; Inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV. When isprep is non-zero:
; rebinds r1 -> r2, r2 -> r3, ..., r13 -> r14 (walking upwards, so each name
; picks up the still-shifted meaning of its successor), then restores r14 from
; r14_save and removes r14_save. Expands to nothing when isprep is zero.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  %xdefine r14 r14_save
  %undef r14_save
 %endif
%endmacro

; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
; Emits RET with the default register names in effect. With the argument left
; at its default of 1 the shifted naming is re-applied afterwards, so the code
; that follows textually keeps the same register names as the code before it.
; Passing 0 leaves the default names active after the RET.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

; MC_8TAP_SCALED_H dst, tmp0..tmp6 [, load_hrnd = 0]
; Processes two consecutive source rows. For each row, eight 16-byte loads are
; gathered at srcq + index*2, the indices being r4, r6, r7, r9 (low lanes) and
; r10, r11, r13, rX (high lanes). Each load is multiplied with m12..m15 using
; pmaddwd, the partial sums are folded with phaddd, m10 is added, the result is
; shifted right arithmetically by xm11 and both rows are packed with signed
; saturation into m<dst>.
;   %1      destination register number, also used as the first temporary
;   %2-%8   temporary register numbers; all of them are overwritten
;   %9      when non-zero, m10 is reloaded from [rsp+0x00] before it is added
; Side effects: srcq is advanced by 2*ssq; m10 is overwritten when %9 is set.
; NOTE(review): m12-m15 presumably hold the filter coefficients for the eight
; gathered positions and m10/xm11 the horizontal rounding constant and shift;
; they are set up by the caller and not visible here - confirm.
%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
    ; first row
    movu               xm%1, [srcq+ r4*2]
    movu               xm%2, [srcq+ r6*2]
    movu               xm%3, [srcq+ r7*2]
    movu               xm%4, [srcq+ r9*2]
    vinserti128        m%1, [srcq+r10*2], 1
    vinserti128        m%2, [srcq+r11*2], 1
    vinserti128        m%3, [srcq+r13*2], 1
    vinserti128        m%4, [srcq+ rX*2], 1
    add                srcq, ssq
    ; second row
    movu               xm%5, [srcq+ r4*2]
    movu               xm%6, [srcq+ r6*2]
    movu               xm%7, [srcq+ r7*2]
    movu               xm%8, [srcq+ r9*2]
    vinserti128        m%5, [srcq+r10*2], 1
    vinserti128        m%6, [srcq+r11*2], 1
    vinserti128        m%7, [srcq+r13*2], 1
    vinserti128        m%8, [srcq+ rX*2], 1
    add                srcq, ssq
    pmaddwd            m%1, m12
    pmaddwd            m%2, m13
    pmaddwd            m%3, m14
    pmaddwd            m%4, m15
    pmaddwd            m%5, m12
    pmaddwd            m%6, m13
    pmaddwd            m%7, m14
    pmaddwd            m%8, m15
    phaddd             m%1, m%2
 %if %9
    mova               m10, [rsp+0x00]
 %endif
    phaddd             m%3, m%4
    phaddd             m%5, m%6
    phaddd             m%7, m%8
    phaddd             m%1, m%3
    phaddd             m%5, m%7
    paddd              m%1, m10
    paddd              m%5, m10
    psrad              m%1, xm11
    psrad              m%5, xm11
    packssdw           m%1, m%5
%endmacro

; MC_8TAP_SCALED put|prep
; Opening lines of the macro only; the remainder of its body follows below.
; The argument selects which of the two entry points is declared and sets
; isput/isprep accordingly.
%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isput 1
 %assign isprep 0
cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
 %xdefine base_reg r12
    mov                 r7d, pxmaxm
%else
 %assign isput 0
 %assign isprep 1
cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
 %define tmp_stridem qword [rsp+0xd0]
 %xdefine base_reg r11
%endif
    lea            base_reg, [%1_8tap_scaled_16bpc_avx2]
%define base base_reg-%1_8tap_scaled_16bpc_avx2
    tzcnt                wd, wm
3580*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, dxm 3581*c0909341SAndroid Build Coastguard Worker%if isprep && UNIX64 3582*c0909341SAndroid Build Coastguard Worker movd xm10, mxd 3583*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, xm10 3584*c0909341SAndroid Build Coastguard Worker mov r5d, t0d 3585*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 5, 7 3586*c0909341SAndroid Build Coastguard Worker mov r6d, pxmaxm 3587*c0909341SAndroid Build Coastguard Worker%else 3588*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, mxm 3589*c0909341SAndroid Build Coastguard Worker %if isput 3590*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, pxmaxm 3591*c0909341SAndroid Build Coastguard Worker %else 3592*c0909341SAndroid Build Coastguard Worker mov r6d, pxmaxm 3593*c0909341SAndroid Build Coastguard Worker %endif 3594*c0909341SAndroid Build Coastguard Worker%endif 3595*c0909341SAndroid Build Coastguard Worker mov dyd, dym 3596*c0909341SAndroid Build Coastguard Worker%if isput 3597*c0909341SAndroid Build Coastguard Worker %if WIN64 3598*c0909341SAndroid Build Coastguard Worker mov r8d, hm 3599*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 3600*c0909341SAndroid Build Coastguard Worker %define hm r5m 3601*c0909341SAndroid Build Coastguard Worker %define dxm r8m 3602*c0909341SAndroid Build Coastguard Worker %else 3603*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 3604*c0909341SAndroid Build Coastguard Worker %define hm r6m 3605*c0909341SAndroid Build Coastguard Worker %endif 3606*c0909341SAndroid Build Coastguard Worker %define dsm [rsp+0x98] 3607*c0909341SAndroid Build Coastguard Worker %define rX r1 3608*c0909341SAndroid Build Coastguard Worker %define rXd r1d 3609*c0909341SAndroid Build Coastguard Worker%else ; prep 3610*c0909341SAndroid Build Coastguard Worker %if WIN64 3611*c0909341SAndroid Build Coastguard Worker mov r7d, hm 
3612*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 3613*c0909341SAndroid Build Coastguard Worker %define hm r4m 3614*c0909341SAndroid Build Coastguard Worker %define dxm r7m 3615*c0909341SAndroid Build Coastguard Worker %else 3616*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 3617*c0909341SAndroid Build Coastguard Worker %define hm [rsp+0x98] 3618*c0909341SAndroid Build Coastguard Worker %endif 3619*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 3620*c0909341SAndroid Build Coastguard Worker %define rX r14 3621*c0909341SAndroid Build Coastguard Worker %define rXd r14d 3622*c0909341SAndroid Build Coastguard Worker%endif 3623*c0909341SAndroid Build Coastguard Worker shr r7d, 11 3624*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pd_0x3ff] 3625*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] 3626*c0909341SAndroid Build Coastguard Worker movd xm7, [base+s_8tap_h_sh+r7*4] 3627*c0909341SAndroid Build Coastguard Worker%if isput 3628*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] 3629*c0909341SAndroid Build Coastguard Worker pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 3630*c0909341SAndroid Build Coastguard Worker%else 3631*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+pd_m524256] 3632*c0909341SAndroid Build Coastguard Worker%endif 3633*c0909341SAndroid Build Coastguard Worker pxor m9, m9 3634*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 3635*c0909341SAndroid Build Coastguard Worker movzx r7d, t1b 3636*c0909341SAndroid Build Coastguard Worker shr t1d, 16 3637*c0909341SAndroid Build Coastguard Worker cmp hd, 6 3638*c0909341SAndroid Build Coastguard Worker cmovs t1d, r7d 3639*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 3640*c0909341SAndroid Build Coastguard Worker cmp dyd, 1024 3641*c0909341SAndroid Build Coastguard Worker 
je .dy1 3642*c0909341SAndroid Build Coastguard Worker cmp dyd, 2048 3643*c0909341SAndroid Build Coastguard Worker je .dy2 3644*c0909341SAndroid Build Coastguard Worker movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] 3645*c0909341SAndroid Build Coastguard Worker add wq, base_reg 3646*c0909341SAndroid Build Coastguard Worker jmp wq 3647*c0909341SAndroid Build Coastguard Worker%if isput 3648*c0909341SAndroid Build Coastguard Worker.w2: 3649*c0909341SAndroid Build Coastguard Worker mov myd, mym 3650*c0909341SAndroid Build Coastguard Worker movzx t0d, t0b 3651*c0909341SAndroid Build Coastguard Worker sub srcq, 2 3652*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 3653*c0909341SAndroid Build Coastguard Worker punpckldq m8, m9, m8 3654*c0909341SAndroid Build Coastguard Worker paddd m10, m8 ; mx+dx*[0,1] 3655*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm14, [base+pq_0x40000000+2] 3656*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, xm15 3657*c0909341SAndroid Build Coastguard Worker pand xm8, xm10, xm6 3658*c0909341SAndroid Build Coastguard Worker psrld xm8, 6 3659*c0909341SAndroid Build Coastguard Worker paddd xm15, xm8 3660*c0909341SAndroid Build Coastguard Worker movd r4d, xm15 3661*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 1 3662*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [base+bdct_lb_q] 3663*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [base+subpel_s_shuf2] 3664*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, [base+subpel_filters+r4*8+2] 3665*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [base+subpel_filters+r6*8+2] 3666*c0909341SAndroid Build Coastguard Worker pcmpeqd xm8, xm9 3667*c0909341SAndroid Build Coastguard Worker psrld m10, 10 3668*c0909341SAndroid Build Coastguard Worker paddd m10, m10 3669*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 3670*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 3671*c0909341SAndroid Build 
Coastguard Worker movu xm2, [srcq+ssq*2] 3672*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ss3q ] 3673*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3674*c0909341SAndroid Build Coastguard Worker pshufb m10, m5 3675*c0909341SAndroid Build Coastguard Worker paddb m10, m6 3676*c0909341SAndroid Build Coastguard Worker vpblendd xm15, xm4, 0xa 3677*c0909341SAndroid Build Coastguard Worker pblendvb xm15, xm14, xm8 3678*c0909341SAndroid Build Coastguard Worker pmovsxbw m15, xm15 3679*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 3680*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 3681*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 3682*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 3683*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3684*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m10}, m0, m1, m2, m3 3685*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m15}, m0, m1, m2, m3 3686*c0909341SAndroid Build Coastguard Worker phaddd m0, m1 3687*c0909341SAndroid Build Coastguard Worker phaddd m2, m3 3688*c0909341SAndroid Build Coastguard Worker paddd m0, m12 3689*c0909341SAndroid Build Coastguard Worker paddd m2, m12 3690*c0909341SAndroid Build Coastguard Worker psrad m0, xm7 3691*c0909341SAndroid Build Coastguard Worker psrad m2, xm7 3692*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 ; 0 1 2 3 4 5 6 7 3693*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 3694*c0909341SAndroid Build Coastguard Worker palignr xm2, xm1, xm0, 4 ; 1 2 3 4 3695*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0, xm2 ; 01 12 3696*c0909341SAndroid Build Coastguard Worker punpckhwd xm0, xm2 ; 23 34 3697*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm1, q0321 ; 5 6 7 _ 3698*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm1, xm4 ; 45 56 3699*c0909341SAndroid 
Build Coastguard Worker punpckhwd xm4, xm1, xm4 ; 67 __ 3700*c0909341SAndroid Build Coastguard Worker.w2_loop: 3701*c0909341SAndroid Build Coastguard Worker and myd, 0x3ff 3702*c0909341SAndroid Build Coastguard Worker mov r6d, 64 << 24 3703*c0909341SAndroid Build Coastguard Worker mov r4d, myd 3704*c0909341SAndroid Build Coastguard Worker shr r4d, 6 3705*c0909341SAndroid Build Coastguard Worker lea r4d, [t1+r4] 3706*c0909341SAndroid Build Coastguard Worker cmovnz r6q, [base+subpel_filters+r4*8] 3707*c0909341SAndroid Build Coastguard Worker movq xm14, r6q 3708*c0909341SAndroid Build Coastguard Worker pmovsxbw xm14, xm14 3709*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm14, q0000 3710*c0909341SAndroid Build Coastguard Worker pshufd xm9, xm14, q1111 3711*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm3, xm8 3712*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm0, xm9 3713*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm14, q2222 3714*c0909341SAndroid Build Coastguard Worker pshufd xm14, xm14, q3333 3715*c0909341SAndroid Build Coastguard Worker paddd xm5, xm6 3716*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm2, xm8 3717*c0909341SAndroid Build Coastguard Worker pmaddwd xm8, xm4, xm14 3718*c0909341SAndroid Build Coastguard Worker psrldq xm9, xm7, 8 3719*c0909341SAndroid Build Coastguard Worker paddd xm5, xm6 3720*c0909341SAndroid Build Coastguard Worker paddd xm5, xm13 3721*c0909341SAndroid Build Coastguard Worker paddd xm5, xm8 3722*c0909341SAndroid Build Coastguard Worker psrad xm5, xm9 3723*c0909341SAndroid Build Coastguard Worker packusdw xm5, xm5 3724*c0909341SAndroid Build Coastguard Worker pminsw xm5, xm11 3725*c0909341SAndroid Build Coastguard Worker movd [dstq], xm5 3726*c0909341SAndroid Build Coastguard Worker add dstq, dsq 3727*c0909341SAndroid Build Coastguard Worker dec hd 3728*c0909341SAndroid Build Coastguard Worker jz .ret 3729*c0909341SAndroid Build Coastguard Worker add myd, dyd 3730*c0909341SAndroid Build 
Coastguard Worker test myd, ~0x3ff 3731*c0909341SAndroid Build Coastguard Worker jz .w2_loop 3732*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq] 3733*c0909341SAndroid Build Coastguard Worker test myd, 0x400 3734*c0909341SAndroid Build Coastguard Worker jz .w2_skip_line 3735*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3736*c0909341SAndroid Build Coastguard Worker shufps xm3, xm0, q1032 ; 01 12 3737*c0909341SAndroid Build Coastguard Worker shufps xm0, xm2, q1032 ; 23 34 3738*c0909341SAndroid Build Coastguard Worker shufps xm2, xm4, q1032 ; 45 56 3739*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm10 3740*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm15 3741*c0909341SAndroid Build Coastguard Worker phaddd xm5, xm5 3742*c0909341SAndroid Build Coastguard Worker paddd xm5, xm12 3743*c0909341SAndroid Build Coastguard Worker psrad xm5, xm7 3744*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 3745*c0909341SAndroid Build Coastguard Worker palignr xm1, xm5, xm1, 12 3746*c0909341SAndroid Build Coastguard Worker punpcklqdq xm1, xm1 ; 6 7 6 7 3747*c0909341SAndroid Build Coastguard Worker punpcklwd xm4, xm1, xm5 ; 67 __ 3748*c0909341SAndroid Build Coastguard Worker jmp .w2_loop 3749*c0909341SAndroid Build Coastguard Worker.w2_skip_line: 3750*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ssq*1] 3751*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3752*c0909341SAndroid Build Coastguard Worker mova xm3, xm0 ; 01 12 3753*c0909341SAndroid Build Coastguard Worker mova xm0, xm2 ; 23 34 3754*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm10 3755*c0909341SAndroid Build Coastguard Worker pshufb xm6, xm10 3756*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm15 3757*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm15 3758*c0909341SAndroid Build Coastguard Worker phaddd xm5, xm6 3759*c0909341SAndroid Build Coastguard Worker paddd xm5, xm12 3760*c0909341SAndroid Build Coastguard Worker psrad 
xm5, xm7 3761*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 ; 6 7 6 7 3762*c0909341SAndroid Build Coastguard Worker palignr xm1, xm5, xm1, 8 ; 4 5 6 7 3763*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm1, q0321 ; 5 6 7 _ 3764*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm1, xm5 ; 45 56 3765*c0909341SAndroid Build Coastguard Worker punpckhwd xm4, xm1, xm5 ; 67 __ 3766*c0909341SAndroid Build Coastguard Worker jmp .w2_loop 3767*c0909341SAndroid Build Coastguard Worker%endif 3768*c0909341SAndroid Build Coastguard Worker.w4: 3769*c0909341SAndroid Build Coastguard Worker mov myd, mym 3770*c0909341SAndroid Build Coastguard Worker mova [rsp+0x00], m12 3771*c0909341SAndroid Build Coastguard Worker%if isput 3772*c0909341SAndroid Build Coastguard Worker mova [rsp+0x20], xm13 3773*c0909341SAndroid Build Coastguard Worker%else 3774*c0909341SAndroid Build Coastguard Worker SWAP m11, m13 3775*c0909341SAndroid Build Coastguard Worker%endif 3776*c0909341SAndroid Build Coastguard Worker mova [rsp+0x30], xm7 3777*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [base+rescale_mul] 3778*c0909341SAndroid Build Coastguard Worker movzx t0d, t0b 3779*c0909341SAndroid Build Coastguard Worker sub srcq, 2 3780*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 3781*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m7 3782*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [base+pq_0x40000000+1] 3783*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, xm15 3784*c0909341SAndroid Build Coastguard Worker SWAP m13, m10 3785*c0909341SAndroid Build Coastguard Worker paddd m13, m8 ; mx+dx*[0-3] 3786*c0909341SAndroid Build Coastguard Worker pand m6, m13 3787*c0909341SAndroid Build Coastguard Worker psrld m6, 6 3788*c0909341SAndroid Build Coastguard Worker paddd xm15, xm6 3789*c0909341SAndroid Build Coastguard Worker movd r4d, xm15 3790*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 1 3791*c0909341SAndroid Build Coastguard 
Worker pextrd r11d, xm15, 2 3792*c0909341SAndroid Build Coastguard Worker pextrd r13d, xm15, 3 3793*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [base+bdct_lb_q+ 0] 3794*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [base+bdct_lb_q+16] 3795*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [base+subpel_s_shuf2] 3796*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm14, [base+subpel_filters+r4*8+2] 3797*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm7, [base+subpel_filters+r6*8+2] 3798*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, [base+subpel_filters+r11*8+2] 3799*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm8, [base+subpel_filters+r13*8+2] 3800*c0909341SAndroid Build Coastguard Worker pcmpeqd m6, m9 3801*c0909341SAndroid Build Coastguard Worker punpckldq m10, m6, m6 3802*c0909341SAndroid Build Coastguard Worker punpckhdq m6, m6 3803*c0909341SAndroid Build Coastguard Worker psrld m13, 10 3804*c0909341SAndroid Build Coastguard Worker paddd m13, m13 3805*c0909341SAndroid Build Coastguard Worker vpblendd xm14, xm7, 0xa 3806*c0909341SAndroid Build Coastguard Worker vpblendd xm15, xm8, 0xa 3807*c0909341SAndroid Build Coastguard Worker pmovsxbw m14, xm14 3808*c0909341SAndroid Build Coastguard Worker pmovsxbw m15, xm15 3809*c0909341SAndroid Build Coastguard Worker pblendvb m14, m2, m10 3810*c0909341SAndroid Build Coastguard Worker pblendvb m15, m2, m6 3811*c0909341SAndroid Build Coastguard Worker pextrd r4, xm13, 2 3812*c0909341SAndroid Build Coastguard Worker pshufb m12, m13, m5 3813*c0909341SAndroid Build Coastguard Worker pshufb m13, m1 3814*c0909341SAndroid Build Coastguard Worker lea r6, [r4+ssq*1] 3815*c0909341SAndroid Build Coastguard Worker lea r11, [r4+ssq*2] 3816*c0909341SAndroid Build Coastguard Worker lea r13, [r4+ss3q ] 3817*c0909341SAndroid Build Coastguard Worker movu xm7, [srcq+ssq*0] 3818*c0909341SAndroid Build Coastguard Worker movu xm9, [srcq+ssq*1] 3819*c0909341SAndroid 
Build Coastguard Worker movu xm8, [srcq+ssq*2] 3820*c0909341SAndroid Build Coastguard Worker movu xm10, [srcq+ss3q ] 3821*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+r4 ] 3822*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+r6 ] 3823*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+r11 ] 3824*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r13 ] 3825*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3826*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [srcq+ssq*0], 1 3827*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [srcq+ssq*1], 1 3828*c0909341SAndroid Build Coastguard Worker vinserti128 m8, [srcq+ssq*2], 1 3829*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [srcq+ss3q ], 1 3830*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+r4 ], 1 3831*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+r6 ], 1 3832*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+r11 ], 1 3833*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+r13 ], 1 3834*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3835*c0909341SAndroid Build Coastguard Worker vpbroadcastb m5, xm13 3836*c0909341SAndroid Build Coastguard Worker psubb m13, m5 3837*c0909341SAndroid Build Coastguard Worker paddb m12, m0 3838*c0909341SAndroid Build Coastguard Worker paddb m13, m0 3839*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m12}, m7, m9, m8, m10 3840*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m7, m9, m8, m10 3841*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m13}, m1, m2, m3, m4 3842*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m15}, m1, m2, m3, m4 3843*c0909341SAndroid Build Coastguard Worker mova m5, [rsp+0x00] 3844*c0909341SAndroid Build Coastguard Worker movd xm6, [rsp+0x30] 3845*c0909341SAndroid Build Coastguard Worker phaddd m7, m1 3846*c0909341SAndroid Build Coastguard Worker phaddd m9, m3 
3847*c0909341SAndroid Build Coastguard Worker phaddd m8, m2 3848*c0909341SAndroid Build Coastguard Worker phaddd m10, m4 3849*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m5}, m7, m9, m8, m10 3850*c0909341SAndroid Build Coastguard Worker REPX {psrad x, xm6}, m7, m9, m8, m10 3851*c0909341SAndroid Build Coastguard Worker packssdw m7, m9 ; 0 1 4 5 3852*c0909341SAndroid Build Coastguard Worker packssdw m8, m10 ; 2 3 6 7 3853*c0909341SAndroid Build Coastguard Worker vextracti128 xm9, m7, 1 ; 4 5 3854*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m8, 1 ; 6 7 3855*c0909341SAndroid Build Coastguard Worker shufps xm4, xm7, xm8, q1032 ; 1 2 3856*c0909341SAndroid Build Coastguard Worker shufps xm5, xm8, xm9, q1032 ; 3 4 3857*c0909341SAndroid Build Coastguard Worker shufps xm6, xm9, xm3, q1032 ; 5 6 3858*c0909341SAndroid Build Coastguard Worker psrldq xm10, xm3, 8 ; 7 _ 3859*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm7, xm4 ; 01 3860*c0909341SAndroid Build Coastguard Worker punpckhwd xm7, xm4 ; 12 3861*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm8, xm5 ; 23 3862*c0909341SAndroid Build Coastguard Worker punpckhwd xm8, xm5 ; 34 3863*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm9, xm6 ; 45 3864*c0909341SAndroid Build Coastguard Worker punpckhwd xm9, xm6 ; 56 3865*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm10 ; 67 3866*c0909341SAndroid Build Coastguard Worker mova [rsp+0x40], xm7 3867*c0909341SAndroid Build Coastguard Worker mova [rsp+0x50], xm8 3868*c0909341SAndroid Build Coastguard Worker mova [rsp+0x60], xm9 3869*c0909341SAndroid Build Coastguard Worker.w4_loop: 3870*c0909341SAndroid Build Coastguard Worker and myd, 0x3ff 3871*c0909341SAndroid Build Coastguard Worker mov r11d, 64 << 24 3872*c0909341SAndroid Build Coastguard Worker mov r13d, myd 3873*c0909341SAndroid Build Coastguard Worker shr r13d, 6 3874*c0909341SAndroid Build Coastguard Worker lea r13d, [t1+r13] 3875*c0909341SAndroid Build 
Coastguard Worker cmovnz r11q, [base+subpel_filters+r13*8] 3876*c0909341SAndroid Build Coastguard Worker movq xm9, r11q 3877*c0909341SAndroid Build Coastguard Worker pmovsxbw xm9, xm9 3878*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm9, q0000 3879*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm9, q1111 3880*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm0, xm7 3881*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm1, xm8 3882*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm9, q2222 3883*c0909341SAndroid Build Coastguard Worker pshufd xm9, xm9, q3333 3884*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm2, xm7 3885*c0909341SAndroid Build Coastguard Worker pmaddwd xm8, xm3, xm9 3886*c0909341SAndroid Build Coastguard Worker%if isput 3887*c0909341SAndroid Build Coastguard Worker mova xm7, [rsp+0x20] 3888*c0909341SAndroid Build Coastguard Worker movd xm9, [rsp+0x38] 3889*c0909341SAndroid Build Coastguard Worker%else 3890*c0909341SAndroid Build Coastguard Worker SWAP m7, m11 3891*c0909341SAndroid Build Coastguard Worker%endif 3892*c0909341SAndroid Build Coastguard Worker paddd xm4, xm5 3893*c0909341SAndroid Build Coastguard Worker paddd xm6, xm8 3894*c0909341SAndroid Build Coastguard Worker paddd xm4, xm6 3895*c0909341SAndroid Build Coastguard Worker paddd xm4, xm7 3896*c0909341SAndroid Build Coastguard Worker%if isput 3897*c0909341SAndroid Build Coastguard Worker psrad xm4, xm9 3898*c0909341SAndroid Build Coastguard Worker packusdw xm4, xm4 3899*c0909341SAndroid Build Coastguard Worker pminuw xm4, xm11 3900*c0909341SAndroid Build Coastguard Worker movq [dstq], xm4 3901*c0909341SAndroid Build Coastguard Worker add dstq, dsq 3902*c0909341SAndroid Build Coastguard Worker%else 3903*c0909341SAndroid Build Coastguard Worker SWAP m11, m7 3904*c0909341SAndroid Build Coastguard Worker psrad xm4, 6 3905*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm4 3906*c0909341SAndroid Build Coastguard Worker movq [tmpq], xm4 
3907*c0909341SAndroid Build Coastguard Worker add tmpq, 8 3908*c0909341SAndroid Build Coastguard Worker%endif 3909*c0909341SAndroid Build Coastguard Worker dec hd 3910*c0909341SAndroid Build Coastguard Worker jz .ret 3911*c0909341SAndroid Build Coastguard Worker add myd, dyd 3912*c0909341SAndroid Build Coastguard Worker test myd, ~0x3ff 3913*c0909341SAndroid Build Coastguard Worker jz .w4_loop 3914*c0909341SAndroid Build Coastguard Worker mova xm8, [rsp+0x00] 3915*c0909341SAndroid Build Coastguard Worker movd xm9, [rsp+0x30] 3916*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq] 3917*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+r4] 3918*c0909341SAndroid Build Coastguard Worker test myd, 0x400 3919*c0909341SAndroid Build Coastguard Worker jz .w4_skip_line 3920*c0909341SAndroid Build Coastguard Worker mova xm0, [rsp+0x40] 3921*c0909341SAndroid Build Coastguard Worker mova [rsp+0x40], xm1 3922*c0909341SAndroid Build Coastguard Worker mova xm1, [rsp+0x50] 3923*c0909341SAndroid Build Coastguard Worker mova [rsp+0x50], xm2 3924*c0909341SAndroid Build Coastguard Worker mova xm2, [rsp+0x60] 3925*c0909341SAndroid Build Coastguard Worker mova [rsp+0x60], xm3 3926*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm12 3927*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm13 3928*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm14 3929*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm15 3930*c0909341SAndroid Build Coastguard Worker phaddd xm4, xm5 3931*c0909341SAndroid Build Coastguard Worker paddd xm4, xm8 3932*c0909341SAndroid Build Coastguard Worker psrad xm4, xm9 3933*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm4 3934*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm10, xm4 3935*c0909341SAndroid Build Coastguard Worker mova xm10, xm4 3936*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3937*c0909341SAndroid Build Coastguard Worker jmp .w4_loop 3938*c0909341SAndroid Build Coastguard Worker.w4_skip_line: 
3939*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ssq*1] 3940*c0909341SAndroid Build Coastguard Worker movu xm7, [srcq+r6] 3941*c0909341SAndroid Build Coastguard Worker movu m0, [rsp+0x50] 3942*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm12 3943*c0909341SAndroid Build Coastguard Worker pshufb xm6, xm12 3944*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm13 3945*c0909341SAndroid Build Coastguard Worker pshufb xm7, xm13 3946*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm14 3947*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm14 3948*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm15 3949*c0909341SAndroid Build Coastguard Worker pmaddwd xm7, xm15 3950*c0909341SAndroid Build Coastguard Worker mova [rsp+0x40], m0 3951*c0909341SAndroid Build Coastguard Worker phaddd xm4, xm5 3952*c0909341SAndroid Build Coastguard Worker phaddd xm6, xm7 3953*c0909341SAndroid Build Coastguard Worker paddd xm4, xm8 3954*c0909341SAndroid Build Coastguard Worker paddd xm6, xm8 3955*c0909341SAndroid Build Coastguard Worker psrad xm4, xm9 3956*c0909341SAndroid Build Coastguard Worker psrad xm6, xm9 3957*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm6 3958*c0909341SAndroid Build Coastguard Worker punpcklwd xm9, xm10, xm4 3959*c0909341SAndroid Build Coastguard Worker mova [rsp+0x60], xm9 3960*c0909341SAndroid Build Coastguard Worker psrldq xm10, xm4, 8 3961*c0909341SAndroid Build Coastguard Worker mova xm0, xm1 3962*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 3963*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 3964*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm4, xm10 3965*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3966*c0909341SAndroid Build Coastguard Worker jmp .w4_loop 3967*c0909341SAndroid Build Coastguard Worker SWAP m10, m13 3968*c0909341SAndroid Build Coastguard Worker%if isprep 3969*c0909341SAndroid Build Coastguard Worker SWAP m13, m11 3970*c0909341SAndroid Build 
Coastguard Worker%endif 3971*c0909341SAndroid Build Coastguard Worker.w8: 3972*c0909341SAndroid Build Coastguard Worker mov dword [rsp+0x80], 1 3973*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 16 3974*c0909341SAndroid Build Coastguard Worker jmp .w_start 3975*c0909341SAndroid Build Coastguard Worker.w16: 3976*c0909341SAndroid Build Coastguard Worker mov dword [rsp+0x80], 2 3977*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 32 3978*c0909341SAndroid Build Coastguard Worker jmp .w_start 3979*c0909341SAndroid Build Coastguard Worker.w32: 3980*c0909341SAndroid Build Coastguard Worker mov dword [rsp+0x80], 4 3981*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 64 3982*c0909341SAndroid Build Coastguard Worker jmp .w_start 3983*c0909341SAndroid Build Coastguard Worker.w64: 3984*c0909341SAndroid Build Coastguard Worker mov dword [rsp+0x80], 8 3985*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 128 3986*c0909341SAndroid Build Coastguard Worker jmp .w_start 3987*c0909341SAndroid Build Coastguard Worker.w128: 3988*c0909341SAndroid Build Coastguard Worker mov dword [rsp+0x80], 16 3989*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 256 3990*c0909341SAndroid Build Coastguard Worker.w_start: 3991*c0909341SAndroid Build Coastguard Worker SWAP m10, m12, m1 3992*c0909341SAndroid Build Coastguard Worker SWAP m11, m7 3993*c0909341SAndroid Build Coastguard Worker ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free 3994*c0909341SAndroid Build Coastguard Worker%if isput 3995*c0909341SAndroid Build Coastguard Worker movifnidn dsm, dsq 3996*c0909341SAndroid Build Coastguard Worker mova [rsp+0xb0], xm7 3997*c0909341SAndroid Build Coastguard Worker%endif 3998*c0909341SAndroid Build Coastguard Worker mova [rsp+0x00], m10 3999*c0909341SAndroid Build Coastguard Worker mova [rsp+0x20], m13 4000*c0909341SAndroid Build Coastguard Worker shr t0d, 16 4001*c0909341SAndroid Build Coastguard Worker sub srcq, 6 
4002*c0909341SAndroid Build Coastguard Worker pmaddwd m8, [base+rescale_mul2] 4003*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 4004*c0909341SAndroid Build Coastguard Worker mov [rsp+0x84], t0d 4005*c0909341SAndroid Build Coastguard Worker mov [rsp+0x88], srcq 4006*c0909341SAndroid Build Coastguard Worker mov [rsp+0x90], r0q ; dstq / tmpq 4007*c0909341SAndroid Build Coastguard Worker%if UNIX64 4008*c0909341SAndroid Build Coastguard Worker mov hm, hd 4009*c0909341SAndroid Build Coastguard Worker%endif 4010*c0909341SAndroid Build Coastguard Worker shl dword dxm, 3 ; dx*8 4011*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, xm15 4012*c0909341SAndroid Build Coastguard Worker paddd m1, m8 ; mx+dx*[0-7] 4013*c0909341SAndroid Build Coastguard Worker jmp .hloop 4014*c0909341SAndroid Build Coastguard Worker.hloop_prep: 4015*c0909341SAndroid Build Coastguard Worker dec dword [rsp+0x80] 4016*c0909341SAndroid Build Coastguard Worker jz .ret 4017*c0909341SAndroid Build Coastguard Worker add qword [rsp+0x90], 16 4018*c0909341SAndroid Build Coastguard Worker mov hd, hm 4019*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, dxm 4020*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pd_0x3ff] 4021*c0909341SAndroid Build Coastguard Worker paddd m1, m8, [rsp+0x40] 4022*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [rsp+0x84] 4023*c0909341SAndroid Build Coastguard Worker pxor m9, m9 4024*c0909341SAndroid Build Coastguard Worker mov srcq, [rsp+0x88] 4025*c0909341SAndroid Build Coastguard Worker mov r0q, [rsp+0x90] ; dstq / tmpq 4026*c0909341SAndroid Build Coastguard Worker.hloop: 4027*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm2, [base+pq_0x40000000] 4028*c0909341SAndroid Build Coastguard Worker pand m5, m1, m6 4029*c0909341SAndroid Build Coastguard Worker psrld m5, 6 4030*c0909341SAndroid Build Coastguard Worker paddd m15, m5 4031*c0909341SAndroid Build Coastguard Worker pcmpeqd m5, m9 4032*c0909341SAndroid Build 
Coastguard Worker vextracti128 xm7, m15, 1 4033*c0909341SAndroid Build Coastguard Worker movq r6, xm15 4034*c0909341SAndroid Build Coastguard Worker pextrq r9, xm15, 1 4035*c0909341SAndroid Build Coastguard Worker movq r11, xm7 4036*c0909341SAndroid Build Coastguard Worker pextrq rX, xm7, 1 4037*c0909341SAndroid Build Coastguard Worker mov r4d, r6d 4038*c0909341SAndroid Build Coastguard Worker shr r6, 32 4039*c0909341SAndroid Build Coastguard Worker mov r7d, r9d 4040*c0909341SAndroid Build Coastguard Worker shr r9, 32 4041*c0909341SAndroid Build Coastguard Worker mov r10d, r11d 4042*c0909341SAndroid Build Coastguard Worker shr r11, 32 4043*c0909341SAndroid Build Coastguard Worker mov r13d, rXd 4044*c0909341SAndroid Build Coastguard Worker shr rX, 32 4045*c0909341SAndroid Build Coastguard Worker mova [rsp+0x40], m1 4046*c0909341SAndroid Build Coastguard Worker movq xm12, [base+subpel_filters+ r4*8] 4047*c0909341SAndroid Build Coastguard Worker movq xm13, [base+subpel_filters+ r6*8] 4048*c0909341SAndroid Build Coastguard Worker movhps xm12, [base+subpel_filters+ r7*8] 4049*c0909341SAndroid Build Coastguard Worker movhps xm13, [base+subpel_filters+ r9*8] 4050*c0909341SAndroid Build Coastguard Worker movq xm14, [base+subpel_filters+r10*8] 4051*c0909341SAndroid Build Coastguard Worker movq xm15, [base+subpel_filters+r11*8] 4052*c0909341SAndroid Build Coastguard Worker movhps xm14, [base+subpel_filters+r13*8] 4053*c0909341SAndroid Build Coastguard Worker movhps xm15, [base+subpel_filters+ rX*8] 4054*c0909341SAndroid Build Coastguard Worker psrld m1, 10 4055*c0909341SAndroid Build Coastguard Worker vextracti128 xm7, m1, 1 4056*c0909341SAndroid Build Coastguard Worker vextracti128 xm6, m5, 1 4057*c0909341SAndroid Build Coastguard Worker movq [rsp+0xa0], xm1 4058*c0909341SAndroid Build Coastguard Worker movq [rsp+0xa8], xm7 4059*c0909341SAndroid Build Coastguard Worker movq r6, xm1 4060*c0909341SAndroid Build Coastguard Worker pextrq r11, xm1, 1 4061*c0909341SAndroid Build 
; ---------------------------------------------------------------------------
; Scaled 8-tap filter, variable vertical step (excerpt of a larger macro body).
; Covers: the end of the per-column horizontal setup, the per-row vertical
; loop (.vloop), the single-row window advance and the start of .skip_line.
; NOTE(review): the excerpt begins in the middle of the setup sequence; the
; code that loads r6, r11, xm2, xm5, xm6, xm7 and xm12-xm15 is not visible
; here, so statements about their contents below are assumptions.
; ---------------------------------------------------------------------------
    ; Every 64-bit GPR carries two packed dwords. Split them so that each of
    ; r4/r6/r7/r9 (low lane) and r10/r11/r13/rX (high lane) holds one value;
    ; they are used further down as [srcq+rN*2] offsets.
    movq              r9, xm7
    pextrq            rX, xm7, 1
    mov               r4d, r6d
    shr               r6, 32
    mov               r10d, r11d
    shr               r11, 32
    mov               r7d, r9d
    shr               r9, 32
    mov               r13d, rXd
    shr               rX, 32
    ; Widen the dword masks in xm5/xm6 to qword masks (each dword duplicated
    ; into a full qword) ...
    pshufd            xm4, xm5, q2200
    pshufd            xm5, xm5, q3311
    pshufd            xm7, xm6, q2200
    pshufd            xm6, xm6, q3311
    ; ... and replace the masked 8-byte entries of xm12-xm15 with xm2
    ; (x86inc maps the three-operand form to vpblendvb, mask = last operand).
    ; presumably xm12-xm15 are per-column filter taps and xm2 a default filter
    pblendvb          xm12, xm2, xm4
    pblendvb          xm13, xm2, xm5
    pblendvb          xm14, xm2, xm7
    pblendvb          xm15, xm2, xm6
    ; Sign-extend the byte entries to words for pmaddwd.
    pmovsxbw          m12, xm12
    pmovsxbw          m13, xm13
    pmovsxbw          m14, xm14
    pmovsxbw          m15, xm15
    ; Horizontal pass for the initial rows (helper macro defined elsewhere).
    ; m0 is parked at [rsp+0x60] because the later invocations list register
    ; 0 among their arguments.
    MC_8TAP_SCALED_H  0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova              [rsp+0x60], m0
    MC_8TAP_SCALED_H  1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H  2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H  3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova              m0, [rsp+0x60]
    vbroadcasti128    m9, [base+subpel_s_shuf8]
    mov               myd, mym
    mov               dyd, dym
    ; Reorder each register into the row-pair layout named in the comments.
    pshufb            m0, m9 ; 01a 01b
    pshufb            m1, m9 ; 23a 23b
    pshufb            m2, m9 ; 45a 45b
    pshufb            m3, m9 ; 67a 67b
.vloop:
    ; Select the vertical filter: idx = (my & 0x3ff) >> 6. ZF from the shr
    ; survives the mov/lea, so cmovnz loads subpel_filters[t1+idx] only when
    ; idx != 0; otherwise r6 keeps 64 << 24 (only byte 3 non-zero).
    and               myd, 0x3ff
    mov               r6d, 64 << 24
    mov               r4d, myd
    shr               r4d, 6
    lea               r4d, [t1+r4]
    cmovnz            r6q, [base+subpel_filters+r4*8]
    ; Duplicate the 8 tap bytes into both qwords, sign-extend to words in
    ; both lanes, then broadcast one tap pair per multiply.
    movq              xm9, r6q
    punpcklqdq        xm9, xm9
    pmovsxbw          m9, xm9
    pshufd            m8, m9, q0000
    pshufd            m7, m9, q1111
    pmaddwd           m4, m0, m8
    pmaddwd           m5, m1, m7
    pshufd            m8, m9, q2222
    pshufd            m9, m9, q3333
    pmaddwd           m6, m2, m8
    pmaddwd           m7, m3, m9
%if isput
    ; Final shift count is taken from the upper qword of xm11.
    psrldq            xm8, xm11, 8
%endif
    ; Add the constant kept at [rsp+0x20] (presumably the vertical rounding
    ; term; its store is outside this excerpt) and sum the four partials.
    paddd             m4, [rsp+0x20]
    paddd             m6, m7
    paddd             m4, m5
    paddd             m4, m6
%if isput
    ; put: shift, pack to unsigned words, clamp against [rsp+0xb0]
    ; (presumably the pixel maximum) and store 16 bytes to dst.
    psrad             m4, xm8
    vextracti128      xm5, m4, 1
    packusdw          xm4, xm5
    pminsw            xm4, [rsp+0xb0]
    mova              [dstq], xm4
    add               dstq, dsm
%else
    ; prep: shift by 6, pack to signed words and store 16 bytes to tmp.
    psrad             m4, 6
    vextracti128      xm5, m4, 1
    packssdw          xm4, xm5
    mova              [tmpq], xm4
    add               tmpq, tmp_stridem
%endif
    dec               hd
    jz                .hloop_prep
    ; Step the vertical position. If no bit above the low 10 is set, the
    ; same source rows are reused for the next output row.
    add               myd, dyd
    test              myd, ~0x3ff
    jz                .vloop
    ; Bit 10 decides how the window advances. The flags from this test stay
    ; valid across the mov instructions up to the jz below.
    test              myd, 0x400
    ; my is parked on the stack and r4/r6/r7/r9 are reloaded from
    ; [rsp+0xa0..0xac]. r4/r6 were used as scratch above; presumably r7/r9
    ; alias my/dy - TODO confirm against the register definitions.
    mov               [rsp+0x60], myd
    mov               r4d, [rsp+0xa0]
    mov               r6d, [rsp+0xa4]
    mov               r7d, [rsp+0xa8]
    mov               r9d, [rsp+0xac]
    jz                .skip_line
    ; Bit 10 set: fetch one new source row. Each of the eight offsets
    ; supplies 16 bytes; low lanes use r4/r6/r7/r9, high lanes
    ; r10/r11/r13/rX.
    vbroadcasti128    m9, [base+wswap]
    movu              xm4, [srcq+ r4*2]
    movu              xm5, [srcq+ r6*2]
    movu              xm6, [srcq+ r7*2]
    movu              xm7, [srcq+ r9*2]
    vinserti128       m4, [srcq+r10*2], 1
    vinserti128       m5, [srcq+r11*2], 1
    vinserti128       m6, [srcq+r13*2], 1
    vinserti128       m7, [srcq+ rX*2], 1
    add               srcq, ssq
    mov               myd, [rsp+0x60]
    mov               dyd, dym
    ; Shuffle the existing row pairs with wswap (presumably swaps the two
    ; words of every dword) so the blends below shift the window by one row.
    pshufb            m0, m9
    pshufb            m1, m9
    pshufb            m2, m9
    pshufb            m3, m9
    ; Horizontal filter of the new row with the per-column taps in m12-m15.
    pmaddwd           m4, m12
    pmaddwd           m5, m13
    pmaddwd           m6, m14
    pmaddwd           m7, m15
    phaddd            m4, m5
    phaddd            m6, m7
    phaddd            m4, m6
    paddd             m4, m10
    psrad             m4, xm11
    ; Move the result into the odd words, then let every register take its
    ; odd words (mask 0xaa) from the next one in the chain.
    pslld             m4, 16
    pblendw           m0, m1, 0xaa
    pblendw           m1, m2, 0xaa
    pblendw           m2, m3, 0xaa
    pblendw           m3, m4, 0xaa
    jmp               .vloop
.skip_line:
    ; Bit 10 clear: drop the oldest register (one row pair) and have the
    ; helper macro produce a new m3.
    mova              m0, m1
    mova              m1, m2
    mova              m2, m3
    MC_8TAP_SCALED_H  3, 10, 4, 5, 6, 7, 8, 9, 1
; ---------------------------------------------------------------------------
; Excerpt of a larger macro body: end of .skip_line, the .dy1 width dispatch,
; the 2-pixel-wide dy1 setup (.dy1_w2, put only) and the first part of its
; row loop.
; NOTE(review): register contents on entry (m6, m7, m8, m9, m10, m11, m12,
; m13, t0, t1) are established outside this excerpt; comments about their
; meaning are assumptions unless stated by an original comment.
; ---------------------------------------------------------------------------
    ; Tail of .skip_line: bring the freshly filtered m3 into the row-pair
    ; layout, restore my/dy and resume the vertical loop.
    vbroadcasti128    m9, [base+subpel_s_shuf8]
    mov               myd, [rsp+0x60]
    mov               dyd, dym
    pshufb            m3, m9
    jmp               .vloop
    ; x86inc SWAP renames registers at assembly time and emits no code. The
    ; preceding jmp is unconditional, so these only change how mN names are
    ; mapped for the code that follows in the file.
    SWAP              m1, m12, m10
    SWAP              m7, m11
.dy1:
    ; Jump-table dispatch: fetch a 16-bit entry indexed by wq, add base_reg
    ; and jump to the result.
    movzx             wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
    add               wq, base_reg
    jmp               wq
%if isput
.dy1_w2:
    mov               myd, mym
    movzx             t0d, t0b            ; keep only the low byte of t0
    sub               srcq, 2
    movd              xm15, t0d
    punpckldq         m8, m9, m8
    paddd             m10, m8 ; mx+dx*[0-1]
    vpbroadcastd      xm14, [base+pq_0x40000000+2]
    vpbroadcastd      xm15, xm15
    ; Filter index per column: ((mx & xm6) >> 6) + t0.
    ; presumably xm6 holds the 0x3ff fraction mask - TODO confirm
    pand              xm8, xm10, xm6
    psrld             xm8, 6
    paddd             xm15, xm8
    movd              r4d, xm15
    pextrd            r6d, xm15, 1
    vbroadcasti128    m5, [base+bdct_lb_q]
    vbroadcasti128    m6, [base+subpel_s_shuf2]
    ; Load 4 bytes starting at byte 2 of each selected 8-byte filter entry.
    vpbroadcastd      m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd      m4, [base+subpel_filters+r6*8+2]
    ; Mask of columns whose masked/shifted position equals xm9
    ; (presumably zero, i.e. no fractional offset).
    pcmpeqd           xm8, xm9
    ; (mx >> 10) * 2: integer column position as a byte offset.
    psrld             m10, 10
    paddd             m10, m10
    ; First four source rows into the low lanes.
    movu              xm0, [srcq+ssq*0]
    movu              xm1, [srcq+ssq*1]
    movu              xm2, [srcq+ssq*2]
    movu              xm3, [srcq+ss3q ]
    lea               srcq, [srcq+ssq*4]
    ; Vertical filter select: ZF from the shr survives the mov/lea, so the
    ; cmovnz loads subpel_filters[t1+(my>>6)] only when my>>6 != 0; otherwise
    ; r4 keeps 64 << 24 (only byte 3 non-zero).
    shr               myd, 6
    mov               r4d, 64 << 24
    lea               myd, [t1+myq]
    cmovnz            r4q, [base+subpel_filters+myq*8]
    ; Build the per-column pshufb control from the byte offsets.
    pshufb            m10, m5
    paddb             m10, m6
    ; Interleave the two columns' taps (dwords 1 and 3 come from xm4), swap
    ; in xm14 where the mask in xm8 is set, then widen to words.
    vpblendd          xm15, xm4, 0xa
    pblendvb          xm15, xm14, xm8
    pmovsxbw          m15, xm15
    ; Next three source rows into the high lanes; seven rows consumed so far.
    vinserti128       m0, [srcq+ssq*0], 1
    vinserti128       m1, [srcq+ssq*1], 1
    vinserti128       m2, [srcq+ssq*2], 1
    add               srcq, ss3q
    ; Vertical taps as words, one pair per register: xm8, xm9, xm14, xm6.
    movq              xm6, r4q
    pmovsxbw          xm6, xm6
    pshufd            xm8, xm6, q0000
    pshufd            xm9, xm6, q1111
    pshufd            xm14, xm6, q2222
    pshufd            xm6, xm6, q3333
    ; Horizontal filter of the seven initial rows.
    REPX              {pshufb x, m10}, m0, m1, m2
    pshufb            xm3, xm10
    REPX              {pmaddwd x, m15}, m0, m1, m2
    pmaddwd           xm3, xm15
    phaddd            m0, m1
    phaddd            m2, m3
    paddd             m0, m12             ; presumably horizontal rounding
    paddd             m2, m12
    psrad             m0, xm7
    psrad             m2, xm7
    packssdw          m0, m2
    ; Rearrange into interleaved row pairs for the vertical pmaddwd.
    vextracti128      xm1, m0, 1
    palignr           xm2, xm1, xm0, 4
    pshufd            xm4, xm1, q2121
    punpcklwd         xm3, xm0, xm2 ; 01 12
    punpckhwd         xm0, xm2 ; 23 34
    punpcklwd         xm2, xm1, xm4 ; 45 56
.dy1_w2_loop:
    ; Two source rows per iteration: horizontal filter into xm1, interleaved
    ; with the vertical multiply-accumulate of the existing row pairs.
    movu              xm1, [srcq+ssq*0]
    movu              xm5, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    pshufb            xm1, xm10
    pshufb            xm5, xm10
    pmaddwd           xm1, xm15
    pmaddwd           xm5, xm15
    phaddd            xm1, xm5
    pmaddwd           xm5, xm3, xm8       ; rows 01 12 * taps 0-1
    mova              xm3, xm0            ; slide window: 23 34 -> 01 12
    pmaddwd           xm0, xm9            ; rows 23 34 * taps 2-3
    paddd             xm1, xm12
    psrad             xm1, xm7
    packssdw          xm1, xm1
    paddd             xm5, xm0
; ---------------------------------------------------------------------------
; Excerpt of a larger macro body: remainder of .dy1_w2_loop (put only),
; followed by the 4-pixel-wide dy1 path (.dy1_w4) up to the end of the
; put/prep store in its row loop.
; NOTE(review): register contents on entry to .dy1_w4 (m6, m7, m8, m9, m10,
; m11, m12, m13, t0, t1) are set up outside this excerpt; comments about
; their meaning are assumptions unless stated by an original comment.
; ---------------------------------------------------------------------------
    ; .dy1_w2_loop, continued: finish the vertical accumulation in xm5 and
    ; slide the row-pair window by two rows.
    mova              xm0, xm2            ; slide window: 45 56 -> 23 34
    pmaddwd           xm2, xm14           ; rows 45 56 * taps 4-5
    paddd             xm5, xm2
    palignr           xm2, xm1, xm4, 12
    punpcklwd         xm2, xm1 ; 67 78
    pmaddwd           xm4, xm2, xm6       ; rows 67 78 * taps 6-7
    paddd             xm5, xm13           ; presumably vertical rounding
    paddd             xm5, xm4
    mova              xm4, xm1            ; keep newest rows for next round
    psrldq            xm1, xm7, 8         ; shift count = upper qword of xm7
    psrad             xm5, xm1
    packusdw          xm5, xm5
    pminsw            xm5, xm11           ; clamp (presumably to pixel max)
    ; Store 4 bytes to each of two destination rows.
    movd              [dstq+dsq*0], xm5
    pextrd            [dstq+dsq*1], xm5, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .dy1_w2_loop
    RET
%endif
.dy1_w4:
    mov               myd, mym
    ; Spill values needed later so their registers can be reused:
    ; [rsp+0x00]=m12, [rsp+0x20]=m13, [rsp+0x40]=xm7, put: [rsp+0x50]=xm11.
%if isput
    mova              [rsp+0x50], xm11
%endif
    mova              [rsp+0x00], m12
    mova              [rsp+0x20], m13
    mova              [rsp+0x40], xm7
    vbroadcasti128    m7, [base+rescale_mul]
    movzx             t0d, t0b            ; keep only the low byte of t0
    sub               srcq, 2
    movd              xm15, t0d
    pmaddwd           m8, m7
    vpbroadcastq      m2, [base+pq_0x40000000+1]
    vpbroadcastd      xm15, xm15
    ; Assembly-time rename (no code emitted): from here on "m13" names the
    ; register previously called m10 and vice versa.
    SWAP              m13, m10
    paddd             m13, m8 ; mx+dx*[0-3]
    ; Filter index per column: ((mx & m6) >> 6) + t0.
    pand              m6, m13
    psrld             m6, 6
    paddd             xm15, xm6
    movd              r4d, xm15
    pextrd            r6d, xm15, 1
    pextrd            r11d, xm15, 2
    pextrd            r13d, xm15, 3
    vbroadcasti128    m5, [base+bdct_lb_q+ 0]
    vbroadcasti128    m1, [base+bdct_lb_q+16]
    vbroadcasti128    m4, [base+subpel_s_shuf2]
    ; Load 4 bytes starting at byte 2 of each selected 8-byte filter entry.
    vpbroadcastd      xm14, [base+subpel_filters+r4*8+2]
    vpbroadcastd      xm7, [base+subpel_filters+r6*8+2]
    vpbroadcastd      xm15, [base+subpel_filters+r11*8+2]
    vpbroadcastd      xm8, [base+subpel_filters+r13*8+2]
    ; Mask of columns equal to m9 (presumably zero), widened so each dword
    ; mask covers a qword: m10 for the low dwords, m6 for the high dwords.
    pcmpeqd           m6, m9
    punpckldq         m10, m6, m6
    punpckhdq         m6, m6
    ; (mx >> 10) * 2: integer column position as a byte offset.
    psrld             m13, 10
    paddd             m13, m13
    ; Interleave taps for columns 0/1 and 2/3, widen to words, then swap in
    ; m2 where the masks are set.
    vpblendd          xm14, xm7, 0xa
    vpblendd          xm15, xm8, 0xa
    pmovsxbw          m14, xm14
    pmovsxbw          m15, xm15
    pblendvb          m14, m2, m10
    pblendvb          m15, m2, m6
    ; r4 = byte offset of column 2; r6/r11/r13 add 2/1/3 source strides.
    pextrd            r4, xm13, 2
    pshufb            m12, m13, m5
    pshufb            m13, m1
    lea               r6, [r4+ssq*2]
    lea               r11, [r4+ssq*1]
    lea               r13, [r4+ss3q ]
    ; Load the first seven rows twice: m0-m3 at srcq for the first column
    ; pair, m7-m10 at srcq+r4 for the second column pair.
    movu              xm0, [srcq+ssq*0]
    movu              xm7, [srcq+r4 ]
    movu              xm1, [srcq+ssq*2]
    movu              xm8, [srcq+r6 ]
    vinserti128       m0, [srcq+ssq*1], 1 ; 0 1
    vinserti128       m7, [srcq+r11 ], 1
    vinserti128       m1, [srcq+ss3q ], 1 ; 2 3
    vinserti128       m8, [srcq+r13 ], 1
    lea               srcq, [srcq+ssq*4]
    movu              xm2, [srcq+ssq*0]
    movu              xm9, [srcq+r4 ]
    movu              xm3, [srcq+ssq*2] ; 6 _
    movu              xm10, [srcq+r6 ]
    vinserti128       m2, [srcq+ssq*1], 1 ; 4 5
    vinserti128       m9, [srcq+r11 ], 1
    lea               srcq, [srcq+ss3q ]
    ; Make the second shuffle relative to its own load address by
    ; subtracting its first byte, then add the base shuffle pattern to both.
    vpbroadcastb      m5, xm13
    psubb             m13, m5
    paddb             m12, m4
    paddb             m13, m4
    mova              m5, [rsp+0x00]      ; spilled m12 (presumably h rounding)
    movd              xm6, [rsp+0x40]     ; low dword of spilled xm7 = shift
    ; Horizontal filter of the seven initial rows.
    pshufb            m0, m12
    pshufb            m1, m12
    pmaddwd           m0, m14
    pmaddwd           m1, m14
    pshufb            m7, m13
    pshufb            m8, m13
    pmaddwd           m7, m15
    pmaddwd           m8, m15
    pshufb            m2, m12
    pshufb            xm3, xm12
    pmaddwd           m2, m14
    pmaddwd           xm3, xm14
    pshufb            m9, m13
    pshufb            xm10, xm13
    pmaddwd           m9, m15
    pmaddwd           xm10, xm15
    phaddd            m0, m7
    phaddd            m1, m8
    phaddd            m2, m9
    phaddd            xm3, xm10
    paddd             m0, m5
    paddd             m1, m5
    paddd             m2, m5
    paddd             xm3, xm5
    psrad             m0, xm6
    psrad             m1, xm6
    psrad             m2, xm6
    psrad             xm3, xm6
    ; Build the one-row-shifted copies used to form interleaved row pairs.
    vperm2i128        m4, m0, m1, 0x21 ; 1 2
    vperm2i128        m5, m1, m2, 0x21 ; 3 4
    vperm2i128        m6, m2, m3, 0x21 ; 5 6
    ; Vertical filter select: ZF from the shr survives the mov/lea, so the
    ; cmovnz loads subpel_filters[t1+(my>>6)] only when my>>6 != 0; otherwise
    ; r13 keeps 64 << 24 (only byte 3 non-zero).
    shr               myd, 6
    mov               r13d, 64 << 24
    lea               myd, [t1+myq]
    cmovnz            r13q, [base+subpel_filters+myq*8]
    pslld             m4, 16
    pslld             m5, 16
    pslld             m6, 16
    pblendw           m0, m4, 0xaa ; 01 12
    pblendw           m1, m5, 0xaa ; 23 34
    pblendw           m2, m6, 0xaa ; 45 56
    ; Vertical taps as words in both lanes, one pair per register: m7-m10.
    movq              xm10, r13q
    punpcklqdq        xm10, xm10
    pmovsxbw          m10, xm10
    pshufd            m7, m10, q0000
    pshufd            m8, m10, q1111
    pshufd            m9, m10, q2222
    pshufd            m10, m10, q3333
.dy1_w4_loop:
    ; Two new source rows per iteration, for both column pairs.
    movu              xm11, [srcq+ssq*0]
    movu              xm6, [srcq+r4 ]
    vinserti128       m11, [srcq+ssq*1], 1
    vinserti128       m6, [srcq+r11 ], 1
    lea               srcq, [srcq+ssq*2]
    pmaddwd           m4, m0, m7          ; rows 01 12 * taps 0-1
    pmaddwd           m5, m1, m8          ; rows 23 34 * taps 2-3
    pshufb            m11, m12
    pshufb            m6, m13
    pmaddwd           m11, m14
    pmaddwd           m6, m15
    paddd             m4, [rsp+0x20]      ; spilled m13 (presumably v rounding)
    phaddd            m11, m6
    pmaddwd           m6, m2, m9          ; rows 45 56 * taps 4-5
    paddd             m11, [rsp+0x00]
    psrad             m11, [rsp+0x40]     ; count = low qword of spilled xm7
    mova              m0, m1              ; slide the row-pair window
    mova              m1, m2
    paddd             m5, m6
    paddd             m4, m5
    ; m3 holds the previous newest row in its low lane; combine it with the
    ; two new rows to form the 67 78 pair.
    vinserti128       m2, m3, xm11, 1
    pslld             m3, m11, 16
    pblendw           m2, m3, 0xaa ; 67 78
    pmaddwd           m5, m2, m10         ; rows 67 78 * taps 6-7
    vextracti128      xm3, m11, 1         ; newest row for the next iteration
    paddd             m4, m5
%if isput
    ; put: the shift count is the upper qword of the xm7 spilled at
    ; [rsp+0x40]; clamp against the xm11 spilled at [rsp+0x50].
    psrad             m4, [rsp+0x48]
    vextracti128      xm5, m4, 1
    packusdw          xm4, xm5
    pminsw            xm4, [rsp+0x50]
    movq              [dstq+dsq*0], xm4
    movhps            [dstq+dsq*1], xm4
    lea               dstq, [dstq+dsq*2]
%else
    ; prep: shift by 6, pack to signed words, store both rows contiguously.
    psrad             m4, 6
    vextracti128      xm5, m4, 1
    packssdw          xm4, xm5
    mova              [tmpq], xm4
    add               tmpq, 16
%endif
; ---------------------------------------------------------------------------
; Excerpt of a larger macro body: end of .dy1_w4_loop, the width entry points
; for the dy1 path (.dy1_w8 ... .dy1_w128) and their shared setup
; (.dy1_w_start), ending at the .dy1_hloop_prep label.
; ---------------------------------------------------------------------------
    ; Tail of .dy1_w4_loop: two rows were produced per iteration.
    sub               hd, 2
    jg                .dy1_w4_loop
    MC_8TAP_SCALED_RET
    ; Assembly-time rename (no code emitted); swaps the m10/m13 names for
    ; the code that follows in the file.
    SWAP              m10, m13
    ; Width entry points. [rsp+0xa0] receives w/8 (it is decremented once per
    ; 16-byte-wide column at .dy1_hloop_prep); for prep, tmp_stridem is set
    ; to w*2.
.dy1_w8:
    mov               dword [rsp+0xa0], 1
    movifprep         tmp_stridem, 16
    jmp               .dy1_w_start
.dy1_w16:
    mov               dword [rsp+0xa0], 2
    movifprep         tmp_stridem, 32
    jmp               .dy1_w_start
.dy1_w32:
    mov               dword [rsp+0xa0], 4
    movifprep         tmp_stridem, 64
    jmp               .dy1_w_start
.dy1_w64:
    mov               dword [rsp+0xa0], 8
    movifprep         tmp_stridem, 128
    jmp               .dy1_w_start
.dy1_w128:
    mov               dword [rsp+0xa0], 16
    movifprep         tmp_stridem, 256
.dy1_w_start:
    SWAP              m10, m12, m1
    SWAP              m11, m7
    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
    mov               myd, mym
%if isput
    ; put: keep the destination stride and pxmax on the stack.
 %define dsm [rsp+0xb8]
    movifnidn         dsm, dsq
    mova              [rsp+0xc0], xm7
%else
    ; prep on UNIX64: the same stack slot holds the row count instead.
 %if UNIX64
  %define hm [rsp+0xb8]
 %endif
%endif
    ; Spill h_rnd, m13 (presumably vertical rounding) and h_sh; they are read
    ; back from these slots by the column and row loops.
    mova              [rsp+0x00], m10
    mova              [rsp+0x20], m13
    mova              [rsp+0x40], xm11
    shr               t0d, 16
    sub               srcq, 6
    ; Vertical filter select: ZF from the shr survives the mov/lea, so the
    ; cmovnz loads subpel_filters[t1+(my>>6)] only when my>>6 != 0; otherwise
    ; r4 keeps 64 << 24 (only byte 3 non-zero).
    shr               myd, 6
    mov               r4d, 64 << 24
    lea               myd, [t1+myq]
    cmovnz            r4q, [base+subpel_filters+myq*8]
    pmaddwd           m8, [base+rescale_mul2]
    movd              xm15, t0d
    ; State reloaded at the start of every further column.
    mov               [rsp+0xa4], t0d
    mov               [rsp+0xa8], srcq
    mov               [rsp+0xb0], r0q ; dstq / tmpq
%if UNIX64
    mov               hm, hd
%endif
    shl               dword dxm, 3 ; dx*8
    vpbroadcastd      m15, xm15
    paddd             m1, m8 ; mx+dx*[0-7]
    ; Vertical taps as eight words at [rsp+0x50]; .dy1_hloop broadcasts one
    ; pair per register from there.
    movq              xm0, r4q
    pmovsxbw          xm0, xm0
    mova              [rsp+0x50], xm0
    jmp               .dy1_hloop
.dy1_hloop_prep:
; ---------------------------------------------------------------------------
; Excerpt of a larger macro body: body of .dy1_hloop_prep (its label is on
; the line just above), the per-column setup .dy1_hloop, and the per-row
; loop .dy1_vloop up to the middle of the next-row load.
; Stack slots used here: [rsp+0x00] h_rnd, [rsp+0x20] constant added before
; the final shift, [rsp+0x40] h_sh, [rsp+0x50] vertical taps (8 words),
; [rsp+0x60] mx, [rsp+0xa0] remaining columns, [rsp+0xa4] t0, [rsp+0xa8] src,
; [rsp+0xb0] dst/tmp, put: [rsp+0xc0] clamp value.
; ---------------------------------------------------------------------------
    ; Next 8-pixel column: stop when the column counter reaches zero,
    ; otherwise advance dst/tmp by 16 bytes and restore per-column state.
    dec               dword [rsp+0xa0]
    jz                .ret
    add               qword [rsp+0xb0], 16
    mov               hd, hm
    vpbroadcastd      m8, dxm
    vpbroadcastd      m6, [base+pd_0x3ff]
    paddd             m1, m8, [rsp+0x60]  ; mx of previous column + dxm
    vpbroadcastd      m15, [rsp+0xa4]
    pxor              m9, m9
    mov               srcq, [rsp+0xa8]
    mov               r0q, [rsp+0xb0] ; dstq / tmpq
    mova              m10, [rsp+0x00]
    mova              xm11, [rsp+0x40]
.dy1_hloop:
    vpbroadcastq      xm2, [base+pq_0x40000000]
    ; Filter index per column: ((mx & m6) >> 6) + m15; m5 then becomes the
    ; mask of columns whose masked/shifted position equals m9.
    ; presumably m6 = 0x3ff and m9 = 0 on first entry as well - TODO confirm
    pand              m5, m1, m6
    psrld             m5, 6
    paddd             m15, m5
    pcmpeqd           m5, m9
    ; Move the eight filter indices to GPRs, two per 64-bit register, then
    ; split them into separate 32-bit values.
    vextracti128      xm7, m15, 1
    movq              r6, xm15
    pextrq            r9, xm15, 1
    movq              r11, xm7
    pextrq            rX, xm7, 1
    mov               r4d, r6d
    shr               r6, 32
    mov               r7d, r9d
    shr               r9, 32
    mov               r10d, r11d
    shr               r11, 32
    mov               r13d, rXd
    shr               rX, 32
    mova              [rsp+0x60], m1      ; remember mx for the next column
    ; Gather the eight 8-byte filter entries, two per xmm register.
    movq              xm12, [base+subpel_filters+ r4*8]
    movq              xm13, [base+subpel_filters+ r6*8]
    movhps            xm12, [base+subpel_filters+ r7*8]
    movhps            xm13, [base+subpel_filters+ r9*8]
    movq              xm14, [base+subpel_filters+r10*8]
    movq              xm15, [base+subpel_filters+r11*8]
    movhps            xm14, [base+subpel_filters+r13*8]
    movhps            xm15, [base+subpel_filters+ rX*8]
    ; mx >> 10: integer column positions, moved to GPRs the same way; they
    ; are used as [srcq+rN*2] offsets by the row loop.
    psrld             m1, 10
    vextracti128      xm7, m1, 1
    vextracti128      xm6, m5, 1
    movq              r6, xm1
    pextrq            r11, xm1, 1
    movq              r9, xm7
    pextrq            rX, xm7, 1
    mov               r4d, r6d
    shr               r6, 32
    mov               r10d, r11d
    shr               r11, 32
    mov               r7d, r9d
    shr               r9, 32
    mov               r13d, rXd
    shr               rX, 32
    ; Widen the dword masks to qword masks and replace the masked filter
    ; entries with xm2 (x86inc maps the three-operand pblendvb to vpblendvb,
    ; mask = last operand).
    pshufd            xm4, xm5, q2200
    pshufd            xm5, xm5, q3311
    pshufd            xm7, xm6, q2200
    pshufd            xm6, xm6, q3311
    pblendvb          xm12, xm2, xm4
    pblendvb          xm13, xm2, xm5
    pblendvb          xm14, xm2, xm7
    pblendvb          xm15, xm2, xm6
    ; Sign-extend the taps to words for pmaddwd.
    pmovsxbw          m12, xm12
    pmovsxbw          m13, xm13
    pmovsxbw          m14, xm14
    pmovsxbw          m15, xm15
    ; Horizontal pass for the initial rows (helper macro defined elsewhere).
    ; m0 is parked at [rsp+0x80] because the later invocations list register
    ; 0 among their arguments.
    MC_8TAP_SCALED_H  0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova              [rsp+0x80], m0
    MC_8TAP_SCALED_H  1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H  2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H  3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova              m0, [rsp+0x80]
    vbroadcasti128    m7, [base+subpel_s_shuf8]
    ; One vertical tap pair per register, constant for the whole column.
    vpbroadcastd      m8, [rsp+0x50]
    vpbroadcastd      m9, [rsp+0x54]
    vpbroadcastd      m10, [rsp+0x58]
    vpbroadcastd      m11, [rsp+0x5c]
    pshufb            m0, m7 ; 01a 01b
    pshufb            m1, m7 ; 23a 23b
    pshufb            m2, m7 ; 45a 45b
    pshufb            m3, m7 ; 67a 67b
.dy1_vloop:
    ; Vertical 8-tap sum for one output row of 8 pixels.
    pmaddwd           m4, m0, m8
    pmaddwd           m5, m1, m9
    pmaddwd           m6, m2, m10
    pmaddwd           m7, m3, m11
    paddd             m4, [rsp+0x20]
    paddd             m6, m7
    paddd             m4, m5
    paddd             m4, m6
%if isput
    ; put: the shift count is the upper qword of the value at [rsp+0x40];
    ; pack to unsigned words, clamp and store 16 bytes.
    psrad             m4, [rsp+0x48]
    vextracti128      xm5, m4, 1
    packusdw          xm4, xm5
    pminsw            xm4, [rsp+0xc0]
    mova              [dstq], xm4
    add               dstq, dsm
%else
    ; prep: shift by 6, pack to signed words, store 16 bytes.
    psrad             m4, 6
    vextracti128      xm5, m4, 1
    packssdw          xm4, xm5
    mova              [tmpq], xm4
    add               tmpq, tmp_stridem
%endif
    dec               hd
    jz                .dy1_hloop_prep
    ; Advance by exactly one source row: shuffle the row pairs with wswap
    ; (presumably swaps the two words of every dword) and start loading the
    ; new row for all eight columns.
    vbroadcasti128    m7, [base+wswap]
    pshufb            m0, m7
    pshufb            m1, m7
    pshufb            m2, m7
    pshufb            m3, m7
    movu              xm4, [srcq+ r4*2]
    movu              xm5, [srcq+ r6*2]
    movu              xm6, [srcq+ r7*2]
    movu              xm7, [srcq+ r9*2]
    vinserti128       m4, [srcq+r10*2], 1
    vinserti128       m5, [srcq+r11*2], 1
; Fragment of MC_8TAP_SCALED: remainder of the .dy1_vloop body, the .dy2 width
; dispatch, and the put-only .dy2_w2 path up to (not including) its loop-closing
; branch. Source-viewer residue removed and one statement restored per line;
; instruction order and operands are unchanged.
    vinserti128     m6, [srcq+r13*2], 1
    vinserti128     m7, [srcq+ rX*2], 1
    add             srcq, ssq
    ; horizontal filter of the new row: multiply by the four filter registers
    ; and reduce with three horizontal adds
    pmaddwd         m4, m12
    pmaddwd         m5, m13
    pmaddwd         m6, m14
    pmaddwd         m7, m15
    phaddd          m4, m5
    phaddd          m6, m7
    phaddd          m4, m6
    paddd           m4, [rsp+0x00]
    psrad           m4, [rsp+0x40]
    ; move the new row into the high word of each dword, then slide the window:
    ; every register keeps its even words and takes the odd words of the next
    pslld           m4, 16
    pblendw         m0, m1, 0xaa
    pblendw         m1, m2, 0xaa
    pblendw         m2, m3, 0xaa
    pblendw         m3, m4, 0xaa
    jmp .dy1_vloop
    ; x86inc SWAP: assemble-time register renaming placed after the jump
    SWAP            m1, m12, m10
    SWAP            m7, m11
.dy2:
    ; indirect jump through a table of 16-bit offsets relative to base_reg,
    ; indexed by wq
    movzx           wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add             wq, base_reg
    jmp             wq
%if isput
.dy2_w2:
    mov             myd, mym
    movzx           t0d, t0b            ; keep only the low byte of t0
    sub             srcq, 2
    movd            xm15, t0d
    punpckldq       m8, m9, m8
    paddd           m10, m8 ; mx+dx*[0-1]
    vpbroadcastd    xm14, [base+pq_0x40000000+2]
    vpbroadcastd    xm15, xm15
    ; per-column filter index = t0 byte + ((position & xm6) >> 6)
    pand            xm8, xm10, xm6
    psrld           xm8, 6
    paddd           xm15, xm8
    movd            r4d, xm15
    pextrd          r6d, xm15, 1
    vbroadcasti128  m5, [base+bdct_lb_q]
    vbroadcasti128  m6, [base+subpel_s_shuf2]
    ; load 4 bytes starting at byte 2 of each 8-byte filter entry
    vpbroadcastd    xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd    xm4, [base+subpel_filters+r6*8+2]
    ; mask of columns whose masked position equals xm9
    ; NOTE(review): xm9 is presumably zero here - set outside this fragment
    pcmpeqd         xm8, xm9
    ; (position >> 10) * 2, used below to build a per-column byte shuffle
    psrld           m10, 10
    paddd           m10, m10
    movu            xm0, [srcq+ssq*0]
    movu            xm1, [srcq+ssq*2]
    movu            xm2, [srcq+ssq*4]
    pshufb          m10, m5
    paddb           m10, m6
    vpblendd        xm15, xm4, 0xa
    pblendvb        xm15, xm14, xm8     ; masked columns take xm14 instead
    pmovsxbw        m15, xm15
    vinserti128     m0, [srcq+ssq*1], 1 ; 0 1
    vinserti128     m1, [srcq+ss3q ], 1 ; 2 3
    lea             srcq, [srcq+ssq*4]
    vinserti128     m2, [srcq+ssq*1], 1 ; 4 5
    lea             srcq, [srcq+ssq*2]
    ; Filter selection from my: mov and lea do not modify flags, so cmovnz
    ; tests the ZF produced by shr. When my>>6 is zero r4 keeps 64<<24 (only
    ; byte 3 non-zero); otherwise the 8-byte entry at index t1+(my>>6) is used.
    shr             myd, 6
    mov             r4d, 64 << 24
    lea             myd, [t1+myq]
    cmovnz          r4q, [base+subpel_filters+myq*8]
    pshufb          m0, m10
    pshufb          m1, m10
    pshufb          m2, m10
    pmaddwd         m0, m15
    pmaddwd         m1, m15
    pmaddwd         m2, m15
    movq            xm6, r4q
    pmovsxbw        xm6, xm6
    phaddd          m0, m1
    phaddd          m1, m2
    paddd           m0, m12
    paddd           m1, m12
    psrad           m0, xm7
    psrad           m1, xm7
    packssdw        m0, m1 ; 0 2 2 4 1 3 3 5
    vextracti128    xm1, m0, 1
    ; broadcast the four coefficient pairs of the sign-extended filter
    pshufd          xm8, xm6, q0000
    pshufd          xm9, xm6, q1111
    pshufd          xm14, xm6, q2222
    pshufd          xm6, xm6, q3333
    punpcklwd       xm2, xm0, xm1 ; 01 23
    punpckhwd       xm1, xm0, xm1 ; 23 45
.dy2_w2_loop:
    ; four new source rows per iteration, two output rows
    movu            xm3, [srcq+ssq*0]
    movu            xm5, [srcq+ssq*2]
    vinserti128     m3, [srcq+ssq*1], 1 ; 6 7
    vinserti128     m5, [srcq+ss3q ], 1 ; 8 9
    lea             srcq, [srcq+ssq*4]
    pmaddwd         xm4, xm2, xm8
    pmaddwd         xm1, xm9
    pshufb          m3, m10
    pshufb          m5, m10
    pmaddwd         m3, m15
    pmaddwd         m5, m15
    phaddd          m3, m5
    paddd           xm4, xm1
    paddd           m3, m12
    psrad           m3, xm7
    packssdw        m3, m3
    pshufd          m3, m3, q2100
    palignr         m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
    vextracti128    xm1, m0, 1
    punpcklwd       xm2, xm0, xm1 ; 45 67
    punpckhwd       xm1, xm0, xm1 ; 67 89
    pmaddwd         xm3, xm2, xm14
    pmaddwd         xm5, xm1, xm6
    paddd           xm4, xm13
    paddd           xm4, xm3
    psrldq          xm3, xm7, 8         ; final shift count = upper qword of xm7
    paddd           xm4, xm5
    psrad           xm4, xm3
    packusdw        xm4, xm4
    pminsw          xm4, xm11           ; clamp against xm11
    movd            [dstq+dsq*0], xm4
    pextrd          [dstq+dsq*1], xm4, 1
    lea             dstq, [dstq+dsq*2]
; Fragment of MC_8TAP_SCALED: loop-closing branch of .dy2_w2, the whole .dy2_w4
; path, and the .dy2_w8/.dy2_w16/.dy2_w32 entry stubs plus the first statement of
; .dy2_w64. Source-viewer residue removed and one statement restored per line;
; instruction order and operands are unchanged.
    sub             hd, 2
    jg .dy2_w2_loop
    RET
%endif
.dy2_w4:
    mov             myd, mym
    ; spill registers that are reused below: xm11 (put only), m12, m13, xm7
%if isput
    mova            [rsp+0x50], xm11
%endif
    mova            [rsp+0x00], m12
    mova            [rsp+0x20], m13
    mova            [rsp+0x40], xm7
    vbroadcasti128  m7, [base+rescale_mul]
    movzx           t0d, t0b            ; keep only the low byte of t0
    sub             srcq, 2
    movd            xm15, t0d
    pmaddwd         m8, m7
    vpbroadcastq    m2, [base+pq_0x40000000+1]
    vpbroadcastd    xm15, xm15
    SWAP            m13, m10
    paddd           m13, m8 ; mx+dx*[0-3]
    ; per-column filter index = t0 byte + ((position & m6) >> 6)
    pand            m6, m13
    psrld           m6, 6
    paddd           xm15, xm6
    movd            r4d, xm15
    pextrd          r6d, xm15, 1
    pextrd          r11d, xm15, 2
    pextrd          r13d, xm15, 3
    vbroadcasti128  m5, [base+bdct_lb_q+ 0]
    vbroadcasti128  m1, [base+bdct_lb_q+16]
    vbroadcasti128  m4, [base+subpel_s_shuf2]
    ; load 4 bytes starting at byte 2 of each 8-byte filter entry
    vpbroadcastd    xm14, [base+subpel_filters+r4*8+2]
    vpbroadcastd    xm7, [base+subpel_filters+r6*8+2]
    vpbroadcastd    xm15, [base+subpel_filters+r11*8+2]
    vpbroadcastd    xm8, [base+subpel_filters+r13*8+2]
    ; Filter selection from my: mov and lea do not modify flags, so cmovnz
    ; tests the ZF produced by shr. When my>>6 is zero r13 keeps 64<<24 (only
    ; byte 3 non-zero); otherwise the 8-byte entry at index t1+(my>>6) is used.
    shr             myd, 6
    mov             r13d, 64 << 24
    lea             myd, [t1+myq]
    cmovnz          r13q, [base+subpel_filters+myq*8]
    ; NOTE(review): m9 is presumably zero here - set outside this fragment
    pcmpeqd         m6, m9
    punpckldq       m11, m6, m6
    punpckhdq       m6, m6
    psrld           m13, 10
    paddd           m13, m13
    vpblendd        xm14, xm7, 0xa
    vpblendd        xm15, xm8, 0xa
    pmovsxbw        m14, xm14
    pmovsxbw        m15, xm15
    movq            xm10, r13q
    pblendvb        m14, m2, m11        ; masked columns take m2 instead
    pblendvb        m15, m2, m6
    ; r4 = dword 2 of m13; r6/r11/r13 = r4 plus 1, 2 and 3 source strides
    pextrd          r4, xm13, 2
    pshufb          m12, m13, m5
    pshufb          m13, m1
    lea             r6, [r4+ssq*1]
    lea             r11, [r4+ssq*2]
    lea             r13, [r4+ss3q ]
    ; each row is loaded twice: at srcq+row (m0-m2) and at srcq+row+r4 (m7-m9)
    movu            xm0, [srcq+ssq*0]
    movu            xm7, [srcq+r4 ]
    movu            xm1, [srcq+ssq*1]
    movu            xm8, [srcq+r6 ]
    vinserti128     m0, [srcq+ssq*2], 1 ; 0 2
    vinserti128     m7, [srcq+r11 ], 1
    vinserti128     m1, [srcq+ss3q ], 1 ; 1 3
    vinserti128     m8, [srcq+r13 ], 1
    lea             srcq, [srcq+ssq*4]
    movu            xm2, [srcq+ssq*0]
    movu            xm9, [srcq+r4 ]
    vinserti128     m2, [srcq+ssq*1], 1 ; 4 5
    vinserti128     m9, [srcq+r6 ], 1
    lea             srcq, [srcq+ssq*2]
    ; make the second shuffle relative to its own first byte
    vpbroadcastb    m5, xm13
    psubb           m13, m5
    paddb           m12, m4
    paddb           m13, m4
    mova            m5, [rsp+0x00]      ; reload the m12 value spilled above
    movd            xm6, [rsp+0x40]     ; low dword of the spilled xm7
    pshufb          m0, m12
    pshufb          m1, m12
    pshufb          m2, m12
    pmaddwd         m0, m14
    pmaddwd         m1, m14
    pmaddwd         m2, m14
    pshufb          m7, m13
    pshufb          m8, m13
    pshufb          m9, m13
    pmaddwd         m7, m15
    pmaddwd         m8, m15
    pmaddwd         m9, m15
    punpcklqdq      xm10, xm10
    pmovsxbw        m10, xm10
    phaddd          m0, m7
    phaddd          m1, m8
    phaddd          m2, m9
    paddd           m0, m5
    paddd           m1, m5
    paddd           m2, m5
    psrad           m0, xm6
    psrad           m1, xm6
    psrad           m2, xm6
    vperm2i128      m3, m0, m2, 0x21 ; 2 4
    vperm2i128      m2, m1, 0x13 ; 3 5
    ; broadcast the four coefficient pairs of the sign-extended filter
    pshufd          m7, m10, q0000
    pshufd          m8, m10, q1111
    pshufd          m9, m10, q2222
    pshufd          m10, m10, q3333
    packssdw        m0, m3 ; 0 2 2 4
    packssdw        m1, m2 ; 1 3 3 5
    punpckhwd       m2, m0, m1 ; 23 45
    punpcklwd       m0, m1 ; 01 23
.dy2_w4_loop:
    ; four new source rows per iteration, two output rows
    movu            xm1, [srcq+ssq*0]
    movu            xm6, [srcq+r4 ]
    movu            xm3, [srcq+ssq*1]
    movu            xm11, [srcq+r6 ]
    vinserti128     m1, [srcq+ssq*2], 1 ; 6 8
    vinserti128     m6, [srcq+r11 ], 1
    vinserti128     m3, [srcq+ss3q ], 1 ; 7 9
    vinserti128     m11, [srcq+r13 ], 1
    lea             srcq, [srcq+ssq*4]
    pmaddwd         m4, m0, m7
    pmaddwd         m5, m2, m8
    pshufb          m1, m12
    pshufb          m3, m12
    pmaddwd         m1, m14
    pmaddwd         m3, m14
    mova            m0, [rsp+0x00]
    pshufb          m6, m13
    pshufb          m11, m13
    pmaddwd         m6, m15
    pmaddwd         m11, m15
    paddd           m4, m5
    movd            xm5, [rsp+0x40]
    phaddd          m1, m6
    phaddd          m3, m11
    paddd           m1, m0
    paddd           m3, m0
    psrad           m1, xm5
    psrad           m3, xm5
    pslld           m3, 16
    pblendw         m1, m3, 0xaa ; 67 89
    vperm2i128      m0, m2, m1, 0x21 ; 45 67
    paddd           m4, [rsp+0x20]
    mova            m2, m1
    pmaddwd         m5, m0, m9
    pmaddwd         m6, m2, m10
    paddd           m4, m5
    paddd           m4, m6
%if isput
    ; put: shift count is the upper qword of the value spilled at [rsp+0x40];
    ; clamp against the xm11 value spilled at [rsp+0x50]
    psrad           m4, [rsp+0x48]
    vextracti128    xm5, m4, 1
    packusdw        xm4, xm5
    pminsw          xm4, [rsp+0x50]
    movq            [dstq+dsq*0], xm4
    movhps          [dstq+dsq*1], xm4
    lea             dstq, [dstq+dsq*2]
%else
    ; prep: fixed shift by 6, signed pack, 16 bytes (two rows) per iteration
    psrad           m4, 6
    vextracti128    xm5, m4, 1
    packssdw        xm4, xm5
    mova            [tmpq], xm4
    add             tmpq, 16
%endif
    sub             hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
    SWAP            m10, m13
    ; Width entry points: [rsp+0xa0] receives a counter (1, 2, 4, 8, 16 for
    ; w8..w128) that .dy2_hloop_prep decrements once per pass; movifprep sets
    ; tmp_stridem (presumably only in the prep build - macro defined elsewhere).
.dy2_w8:
    mov             dword [rsp+0xa0], 1
    movifprep       tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov             dword [rsp+0xa0], 2
    movifprep       tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov             dword [rsp+0xa0], 4
    movifprep       tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov             dword [rsp+0xa0], 8
; Fragment of MC_8TAP_SCALED: rest of .dy2_w64, .dy2_w128, the shared
; .dy2_w_start setup, .dy2_hloop_prep, .dy2_hloop and the first part of
; .dy2_vloop. Source-viewer residue removed and one statement restored per line;
; instruction order and operands are unchanged.
    movifprep       tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov             dword [rsp+0xa0], 16
    movifprep       tmp_stridem, 256
.dy2_w_start:
    SWAP            m10, m12, m1
    SWAP            m11, m7
    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
    mov             myd, mym
%if isput
    movifnidn       dsm, dsq
    mova            [rsp+0xc0], xm7     ; pxmax, used as the store clamp
%endif
    mova            [rsp+0x00], m10     ; h_rnd
    mova            [rsp+0x20], m13     ; added to the vertical sum in .dy2_vloop
    mova            [rsp+0x40], xm11    ; h_sh; its upper qword is read at +0x48
    shr             t0d, 16
    sub             srcq, 6
    ; Filter selection from my: mov and lea do not modify flags, so cmovnz
    ; tests the ZF produced by shr. When my>>6 is zero r4 keeps 64<<24 (only
    ; byte 3 non-zero); otherwise the 8-byte entry at index t1+(my>>6) is used.
    shr             myd, 6
    mov             r4d, 64 << 24
    lea             myd, [t1+myq]
    cmovnz          r4q, [base+subpel_filters+myq*8]
    pmaddwd         m8, [base+rescale_mul2]
    movd            xm15, t0d
    ; state reloaded by .dy2_hloop_prep for every further pass
    mov             [rsp+0xa4], t0d
    mov             [rsp+0xa8], srcq
    mov             [rsp+0xb0], r0q ; dstq / tmpq
%if UNIX64
    mov             hm, hd
%endif
    shl             dword dxm, 3 ; dx*8
    vpbroadcastd    m15, xm15
    paddd           m1, m8 ; mx+dx*[0-7]
    ; sign-extended filter words, broadcast pairwise at .dy2_hloop's end
    movq            xm0, r4q
    pmovsxbw        xm0, xm0
    mova            [rsp+0x50], xm0
    jmp .dy2_hloop
.dy2_hloop_prep:
    ; next pass: finish when the counter reaches zero, otherwise advance
    ; the saved output pointer by 16 bytes and restore the per-pass state
    dec             dword [rsp+0xa0]
    jz .ret
    add             qword [rsp+0xb0], 16
    mov             hd, hm
    vpbroadcastd    m8, dxm             ; dx*8 (dxm was scaled in .dy2_w_start)
    vpbroadcastd    m6, [base+pd_0x3ff]
    paddd           m1, m8, [rsp+0x60]  ; previous pass's mx + dx*8
    vpbroadcastd    m15, [rsp+0xa4]
    pxor            m9, m9
    mov             srcq, [rsp+0xa8]
    mov             r0q, [rsp+0xb0] ; dstq / tmpq
    mova            m10, [rsp+0x00]
    mova            xm11, [rsp+0x40]
.dy2_hloop:
    vpbroadcastq    xm2, [base+pq_0x40000000]
    ; per-column filter index = m15 + ((mx & m6) >> 6); m5 then becomes the
    ; mask of columns whose masked value equals m9
    pand            m5, m1, m6
    psrld           m5, 6
    paddd           m15, m5
    pcmpeqd         m5, m9
    ; split the eight dword indices into r4, r6, r7, r9, r10, r11, r13, rX
    vextracti128    xm7, m15, 1
    movq            r6, xm15
    pextrq          r9, xm15, 1
    movq            r11, xm7
    pextrq          rX, xm7, 1
    mov             r4d, r6d
    shr             r6, 32
    mov             r7d, r9d
    shr             r9, 32
    mov             r10d, r11d
    shr             r11, 32
    mov             r13d, rXd
    shr             rX, 32
    mova            [rsp+0x60], m1      ; saved for .dy2_hloop_prep
    ; gather eight 8-byte filter entries, two per xmm register
    movq            xm12, [base+subpel_filters+ r4*8]
    movq            xm13, [base+subpel_filters+ r6*8]
    movhps          xm12, [base+subpel_filters+ r7*8]
    movhps          xm13, [base+subpel_filters+ r9*8]
    movq            xm14, [base+subpel_filters+r10*8]
    movq            xm15, [base+subpel_filters+r11*8]
    movhps          xm14, [base+subpel_filters+r13*8]
    movhps          xm15, [base+subpel_filters+ rX*8]
    ; mx >> 10 per column, split into the eight offsets used by the row loads
    psrld           m1, 10
    vextracti128    xm7, m1, 1
    vextracti128    xm6, m5, 1
    movq            r6, xm1
    pextrq          r11, xm1, 1
    movq            r9, xm7
    pextrq          rX, xm7, 1
    mov             r4d, r6d
    shr             r6, 32
    mov             r10d, r11d
    shr             r11, 32
    mov             r7d, r9d
    shr             r9, 32
    mov             r13d, rXd
    shr             rX, 32
    ; widen each dword mask to a qword so it covers a whole 8-byte filter,
    ; then substitute xm2 for the filters of the masked columns
    pshufd          xm4, xm5, q2200
    pshufd          xm5, xm5, q3311
    pshufd          xm7, xm6, q2200
    pshufd          xm6, xm6, q3311
    pblendvb        xm12, xm2, xm4
    pblendvb        xm13, xm2, xm5
    pblendvb        xm14, xm2, xm7
    pblendvb        xm15, xm2, xm6
    pmovsxbw        m12, xm12
    pmovsxbw        m13, xm13
    pmovsxbw        m14, xm14
    pmovsxbw        m15, xm15
    ; MC_8TAP_SCALED_H is defined outside this fragment. m0 is spilled because
    ; the last invocation names register 0 again in its argument list.
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova            [rsp+0x80], m0
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova            m0, [rsp+0x80]
    vbroadcasti128  m7, [base+subpel_s_shuf8]
    ; the four coefficient pairs stored at [rsp+0x50] by .dy2_w_start
    vpbroadcastd    m8, [rsp+0x50]
    vpbroadcastd    m9, [rsp+0x54]
    vpbroadcastd    m10, [rsp+0x58]
    vpbroadcastd    m11, [rsp+0x5c]
    pshufb          m0, m7 ; 01a 01b
    pshufb          m1, m7 ; 23a 23b
    pshufb          m2, m7 ; 45a 45b
    pshufb          m3, m7 ; 67a 67b
.dy2_vloop:
    ; four word-pair dot products (rows 01/23/45/67) summed into m4 together
    ; with the constant kept at [rsp+0x20]
    pmaddwd         m4, m0, m8
    pmaddwd         m5, m1, m9
    pmaddwd         m6, m2, m10
    pmaddwd         m7, m3, m11
    paddd           m4, [rsp+0x20]
    paddd           m6, m7
    paddd           m4, m5
    paddd           m4, m6
%if isput
    ; put: shift count is the upper qword of the h_sh spill; clamp to pxmax
    psrad           m4, [rsp+0x48]
    vextracti128    xm5, m4, 1
    packusdw        xm4, xm5
    pminsw          xm4, [rsp+0xc0]
    mova            [dstq], xm4
    add             dstq, dsm
%else
    ; prep: fixed shift by 6, signed pack
    psrad           m4, 6
    vextracti128    xm5, m4, 1
    packssdw        xm4, xm5
    mova            [tmpq], xm4
    add             tmpq, tmp_stridem
%endif
    dec             hd
    jz .dy2_hloop_prep
    ; slide the row-pair window down by one pair (continues past this fragment)
    mova            m0, m1
    mova            m1, m2
5083*c0909341SAndroid Build Coastguard Worker mova m2, m3 5084*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ r4*2] 5085*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ r6*2] 5086*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ r7*2] 5087*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ r9*2] 5088*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+r10*2], 1 5089*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+r11*2], 1 5090*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [srcq+r13*2], 1 5091*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [srcq+ rX*2], 1 5092*c0909341SAndroid Build Coastguard Worker add srcq, ssq 5093*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12 5094*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 5095*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m14 5096*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m15 5097*c0909341SAndroid Build Coastguard Worker phaddd m3, m4 5098*c0909341SAndroid Build Coastguard Worker phaddd m5, m6 5099*c0909341SAndroid Build Coastguard Worker phaddd m3, m5 5100*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ r4*2] 5101*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ r6*2] 5102*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ r7*2] 5103*c0909341SAndroid Build Coastguard Worker movu xm7, [srcq+ r9*2] 5104*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+r10*2], 1 5105*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [srcq+r11*2], 1 5106*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [srcq+r13*2], 1 5107*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [srcq+ rX*2], 1 5108*c0909341SAndroid Build Coastguard Worker add srcq, ssq 5109*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m12 5110*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m13 5111*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m14 5112*c0909341SAndroid Build 
; Closing statements of a multi-line macro whose beginning lies earlier in the
; file (it is terminated by the %endmacro below). Instruction order unchanged.
    pmaddwd         m7, m15
    phaddd          m4, m5
    phaddd          m6, m7
    mova            m5, [rsp+0x00]
    movd            xm7, [rsp+0x40]
    phaddd          m4, m6
    paddd           m3, m5
    paddd           m4, m5
    psrad           m3, xm7
    psrad           m4, xm7
    pslld           m4, 16
    pblendw         m3, m4, 0xaa            ; even words from m3, odd words from m4
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%undef isput
%undef isprep
%endmacro

;-----------------------------------------------------------------------
; BILIN_SCALED_FN prefix
;   %1 = function-name prefix (instantiated below with `put` and `prep`).
; Declares the entry point <prefix>_bilin_scaled_16bpc. It loads the same
; packed constant into t0d and t1d and then tail-jumps into
; <prefix>_8tap_scaled_16bpc, so the bilinear variant shares that body.
; NOTE(review): the constant presumably selects the bilinear rows of the
; filter table for both directions - confirm against the FN macro, which is
; not visible in this part of the file.
;-----------------------------------------------------------------------
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
    mov             t0d, (5*15 << 16) | 5*15
    mov             t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro

; Temp-register aliases (t0, t1) used by the put entry points; the register
; numbers differ between the Win64 and the other x86-64 ABI.
%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

; One entry point per filter combination. All but the last name an explicit
; function to continue in; the last one is immediately followed by the body.
; NOTE(review): `regular` presumably falls through into MC_8TAP_SCALED - confirm.
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

; Temp-register aliases (t0, t1) used by the prep entry points.
%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
; Remaining prep_8tap_scaled entry points (the list starts earlier in the file).
; NOTE(review): `regular` names no continuation target and presumably falls
; through into the MC_8TAP_SCALED body that follows - confirm against FN.
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

;-----------------------------------------------------------------------
; WARP_V dst, 01, 23, 45, 67
;   Vertical pass for one output row of the 8x8 warp.
;   %1       = number of the vector register receiving the 8 dword sums
;   %2..%5   = registers holding the interleaved row pairs (01, 23, 45, 67)
; Eight filter rows are fetched, one per column, at indices
; (my + k*delta) >> 10 for k = 0..7 (8 bytes each), transposed, and
; multiplied against the row pairs with pmaddwd.
; Side effects:
;   - my ends up as my + 3*delta + gamma; the caller pre-subtracts delta*3
;     from gamma, so the net effect is "my += gamma".
;   - the row-pair window slides: m%2 <- m%3, m%3 <- m%4, m%4 <- m%5.
; Expects m11 == 0. Clobbers m0, m8, m9, tmp1, tmp2.
;-----------------------------------------------------------------------
%macro WARP_V 5 ; dst, 01, 23, 45, 67
    lea             tmp1d, [myq+deltaq*4]
    lea             tmp2d, [myq+deltaq*1]
    shr             myd, 10
    shr             tmp1d, 10
    movq            xm8, [filterq+myq  *8]
    vinserti128     m8, [filterq+tmp1q*8], 1 ; a e
    lea             tmp1d, [tmp2q+deltaq*4]
    lea             myd, [tmp2q+deltaq*1]
    shr             tmp2d, 10
    shr             tmp1d, 10
    movq            xm0, [filterq+tmp2q*8]
    vinserti128     m0, [filterq+tmp1q*8], 1 ; b f
    lea             tmp1d, [myq+deltaq*4]
    lea             tmp2d, [myq+deltaq*1]
    shr             myd, 10
    shr             tmp1d, 10
    movq            xm9, [filterq+myq  *8]
    vinserti128     m9, [filterq+tmp1q*8], 1 ; c g
    lea             tmp1d, [tmp2q+deltaq*4]
    lea             myd, [tmp2q+gammaq]      ; my += gamma
    punpcklwd       m8, m0
    shr             tmp2d, 10
    shr             tmp1d, 10
    movq            xm0, [filterq+tmp2q*8]
    vinserti128     m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd       m0, m9, m0
    punpckldq       m9, m8, m0
    punpckhdq       m0, m8, m0
    ; Interleaving with the zero register puts each coefficient byte in the
    ; high half of a word, i.e. coefficient << 8.
    punpcklbw       m8, m11, m9              ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    punpckhbw       m9, m11, m9              ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    pmaddwd         m%2, m8
    pmaddwd         m9, m%3
    punpcklbw       m8, m11, m0              ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    punpckhbw       m0, m11, m0              ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd         m8, m%4
    pmaddwd         m0, m%5
    paddd           m9, m%2
    mova            m%2, m%3                 ; slide the row-pair window
    paddd           m0, m8
    mova            m%3, m%4
    mova            m%4, m%5
    paddd           m%1, m0, m9
%endmacro

;-----------------------------------------------------------------------
; warp_affine_8x8t_16bpc: 8x8 warp writing 16-bit intermediates to tmp.
; Shares .main/.main2 with warp_affine_8x8_16bpc (AVX2 build) and differs
; only in the output stage: add m14, arithmetic shift right by 15, signed
; pack to words.
; Each loop iteration stores two rows (tsq*2 bytes apart) and advances tmpq
; by tsq*4 bytes; r4d, set to 4 inside .main, counts the iterations.
; NOTE(review): r7m is used as the index source for warp8x8_shift; in the
; sibling function the same stack slot is annotated pixel_max - confirm.
;-----------------------------------------------------------------------
cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
    mov             r6d, r7m
    lea             r9, [$$]
    shr             r6d, 11
    vpbroadcastd    m13, [r9-$$+warp8x8_shift+r6*4]
    vpbroadcastd    m14, [warp8x8t_rnd]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
    jmp .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
    lea             tmpq, [tmpq+tsq*4]
.start:
    paddd           m7, m14
    paddd           m0, m14
    psrad           m7, 15
    psrad           m0, 15
    packssdw        m7, m0
    vpermq          m7, m7, q3120           ; undo per-lane pack order: row A | row B
    mova            [tmpq+tsq*0], xm7
    vextracti128    [tmpq+tsq*2], m7, 1
    dec             r4d
    jg .loop
.end:
    RET

;-----------------------------------------------------------------------
; warp_affine_8x8_16bpc: 8x8 warp writing clamped pixels to dst.
;   m13 = warp8x8_shift[pixel_max >> 11]  (per-dword shift used by .h)
;   m14 = warp8x8_rnd  [pixel_max >> 11]  (pmulhrsw multiplier)
;   m15 = pixel_max broadcast to words    (upper clamp)
; Output stage: arithmetic shift right by 16, unsigned pack to words,
; pmulhrsw by m14, min with pixel_max. Each iteration stores two rows
; (dsq bytes apart) and advances dstq by dsq*2; r4d (set to 4 in .main)
; counts the iterations.
;
; Local subroutines:
;   .main  - loads parameters, filters the first 7 source rows horizontally,
;            then falls into .main2
;   .main2 - filters 2 more rows and produces two output rows in m7 and m0
;   .h     - horizontal filter of one source row into m0
;-----------------------------------------------------------------------
cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
                                          alpha, beta, filter, tmp1, delta, \
                                          my, gamma
    mov             r6d, r7m
    lea             filterq, [$$]
    shr             r6d, 11
    vpbroadcastd    m13, [filterq-$$+warp8x8_shift+r6*4]
    vpbroadcastd    m14, [filterq-$$+warp8x8_rnd  +r6*4]
    vpbroadcastw    m15, r7m                ; pixel_max
    call .main
    jmp .start
.loop:
    call .main2
    lea             dstq, [dstq+dsq*2]
.start:
    psrad           m7, 16
    psrad           m0, 16
    packusdw        m7, m0
    pmulhrsw        m7, m14
    pminsw          m7, m15
    vpermq          m7, m7, q3120           ; undo per-lane pack order: row A | row B
    mova            [dstq+dsq*0], xm7
    vextracti128    [dstq+dsq*1], m7, 1
    dec             r4d
    jg .loop
.end:
    RET
ALIGN function_align
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov             abcdq, r5m
    mov             mxd, r6m
%endif
    movsx           alphad, word [abcdq+2*0]
    movsx           betad, word [abcdq+2*1]
    vpbroadcastd    m12, [pd_32768]         ; rounding constant added in .h
    pxor            m11, m11                ; zero register for the byte->word<<8 unpacks
    add             filterq, mc_warp_filter-$$
    lea             tmp1q, [ssq*3]
    add             mxd, 512+(64<<10)
    lea             tmp2d, [alphaq*3]
    sub             srcq, tmp1q             ; src -= src_stride*3
    sub             betad, tmp2d            ; beta -= alpha*3
    mov             myd, r7m                ; slot shifted by one, see comment above
    ; Build the interleaved row pairs: each .h call returns one row in the
    ; upper word of every dword of m0; the previous row is moved to the lower
    ; word with psrld and the new one blended on top.
    call .h
    psrld           m1, m0, 16
    call .h
    pblendw         m1, m0, 0xaa            ; 01
    psrld           m2, m0, 16
    call .h
    pblendw         m2, m0, 0xaa            ; 12
    psrld           m3, m0, 16
    call .h
    pblendw         m3, m0, 0xaa            ; 23
    psrld           m4, m0, 16
    call .h
    pblendw         m4, m0, 0xaa            ; 34
    psrld           m5, m0, 16
    call .h
    pblendw         m5, m0, 0xaa            ; 45
    psrld           m6, m0, 16
    call .h
    pblendw         m6, m0, 0xaa            ; 56
    movsx           deltad, word [abcdq+2*2]
    movsx           gammad, word [abcdq+2*3]
    add             myd, 512+(64<<10)
    mov             r4d, 4                  ; iteration counter for the callers' loops
    lea             tmp1d, [deltaq*3]
    sub             gammad, tmp1d           ; gamma -= delta*3
.main2:
    call .h
    psrld           m7, m6, 16
    pblendw         m7, m0, 0xaa            ; 67
    WARP_V          7, 1, 3, 5, 7
    call .h
    psrld           m10, m5, 16             ; m5 holds 67 after the window slide
    pblendw         m10, m0, 0xaa           ; 78
    WARP_V          0, 2, 4, 6, 10
    ret
ALIGN function_align
.h:
    ; Horizontal filter of one source row. Column k uses the filter row at
    ; index (mx + k*alpha) >> 10 and a 16-byte source window starting at
    ; srcq - 6 + 2*k. On exit mx has advanced by beta (beta was pre-reduced
    ; by alpha*3 in .main) and srcq by ssq.
    ; Result in m0; clobbers m8, m9, m10, tmp1, tmp2.
    lea             tmp1d, [mxq+alphaq*4]
    lea             tmp2d, [mxq+alphaq*1]
    movu            xm10, [srcq-6]
    vinserti128     m10, [srcq+2], 1
    shr             mxd, 10                 ; 0
    shr             tmp1d, 10               ; 4
    movq            xm0, [filterq+mxq  *8]
    vinserti128     m0, [filterq+tmp1q*8], 1
    lea             tmp1d, [tmp2q+alphaq*4]
    lea             mxd, [tmp2q+alphaq*1]
    movu            xm8, [srcq-4]
    vinserti128     m8, [srcq+4], 1
    shr             tmp2d, 10               ; 1
    shr             tmp1d, 10               ; 5
    movq            xm9, [filterq+tmp2q*8]
    vinserti128     m9, [filterq+tmp1q*8], 1
    lea             tmp1d, [mxq+alphaq*4]
    lea             tmp2d, [mxq+alphaq*1]
    shr             mxd, 10                 ; 2
    shr             tmp1d, 10               ; 6
    punpcklbw       m0, m11, m0
    pmaddwd         m0, m10
    movu            xm10, [srcq-2]
    vinserti128     m10, [srcq+6], 1
    punpcklbw       m9, m11, m9
    pmaddwd         m9, m8
    movq            xm8, [filterq+mxq  *8]
    vinserti128     m8, [filterq+tmp1q*8], 1
    lea             tmp1d, [tmp2q+alphaq*4]
    lea             mxd, [tmp2q+betaq]      ; mx += beta
    phaddd          m0, m9                  ; 0 1 4 5
    movu            xm9, [srcq+0]
    vinserti128     m9, [srcq+8], 1
    shr             tmp2d, 10               ; 3
    shr             tmp1d, 10               ; 7
    punpcklbw       m8, m11, m8
    pmaddwd         m8, m10
    movq            xm10, [filterq+tmp2q*8]
    vinserti128     m10, [filterq+tmp1q*8], 1
    punpcklbw       m10, m11, m10
    pmaddwd         m9, m10
    add             srcq, ssq
    phaddd          m8, m9                  ; 2 3 6 7
    phaddd          m0, m8                  ; 0 1 2 3 4 5 6 7
    vpsllvd         m0, m13
    paddd           m0, m12                 ; rounded 14-bit result in upper 16 bits of dword
    ret

;-----------------------------------------------------------------------
; BIDIR_FN: shared store/loop skeleton for the bidirectional kernels.
; Expects, in the including function:
;   - a local .main that fills m0..m3 with 128 bytes of output and advances
;     its own input pointers,
;   - wq holding the address of the width-specific label (.w4 ... .w128),
;   - hd holding the row count.
; Rows covered by one .main call: w4 = 16, w8 = 8, w16 = 4, w32 = 2,
; w64 = 1; w128 needs two calls per row.
; .w4 has no loop: it returns after 4 or 8 rows when hd is 4 or 8, and
; otherwise stores 16 rows. .w8 returns early when hd == 4.
;-----------------------------------------------------------------------
%macro BIDIR_FN 0
    call .main
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    movq            [dstq          ], xm0
    movhps          [dstq+strideq*1], xm0
    vextracti128    xm0, m0, 1
    movq            [dstq+strideq*2], xm0
    movhps          [dstq+stride3q ], xm0
    cmp             hd, 4
    je .ret
    lea             dstq, [dstq+strideq*4]
    movq            [dstq          ], xm1
    movhps          [dstq+strideq*1], xm1
    vextracti128    xm1, m1, 1
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+stride3q ], xm1
    cmp             hd, 8
    je .ret
    lea             dstq, [dstq+strideq*4]
    movq            [dstq          ], xm2
    movhps          [dstq+strideq*1], xm2
    vextracti128    xm2, m2, 1
    movq            [dstq+strideq*2], xm2
    movhps          [dstq+stride3q ], xm2
    lea             dstq, [dstq+strideq*4]
    movq            [dstq          ], xm3
    movhps          [dstq+strideq*1], xm3
    vextracti128    xm3, m3, 1
    movq            [dstq+strideq*2], xm3
    movhps          [dstq+stride3q ], xm3
.ret:
    RET
.w8:
    mova            [dstq+strideq*0], xm0
    vextracti128    [dstq+strideq*1], m0, 1
    mova            [dstq+strideq*2], xm1
    vextracti128    [dstq+stride3q ], m1, 1
    cmp             hd, 4
    jne .w8_loop_start
    RET
.w8_loop:
    call .main
    lea             dstq, [dstq+strideq*4]
    mova            [dstq+strideq*0], xm0
    vextracti128    [dstq+strideq*1], m0, 1
    mova            [dstq+strideq*2], xm1
    vextracti128    [dstq+stride3q ], m1, 1
.w8_loop_start:
    lea             dstq, [dstq+strideq*4]
    mova            [dstq+strideq*0], xm2
    vextracti128    [dstq+strideq*1], m2, 1
    mova            [dstq+strideq*2], xm3
    vextracti128    [dstq+stride3q ], m3, 1
    sub             hd, 8
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea             dstq, [dstq+strideq*4]
.w16:
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    mova            [dstq+strideq*2], m2
    mova            [dstq+stride3q ], m3
    sub             hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea             dstq, [dstq+strideq*2]
.w32:
    mova            [dstq+strideq*0+32*0], m0
    mova            [dstq+strideq*0+32*1], m1
    mova            [dstq+strideq*1+32*0], m2
    mova            [dstq+strideq*1+32*1], m3
    sub             hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add             dstq, strideq
.w64:
    mova            [dstq+32*0], m0
    mova            [dstq+32*1], m1
    mova            [dstq+32*2], m2
    mova            [dstq+32*3], m3
    dec             hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add             dstq, strideq
.w128:
    mova            [dstq+32*0], m0
    mova            [dstq+32*1], m1
    mova            [dstq+32*2], m2
    mova            [dstq+32*3], m3
    call .main                              ; second half of the 128-pixel row
    mova            [dstq+32*4], m0
    mova            [dstq+32*5], m1
    mova            [dstq+32*6], m2
    mova            [dstq+32*7], m3
    dec             hd
    jg .w128_loop
    RET
%endmacro

; Temp-register alias t0 for the bidirectional kernels (ABI dependent).
%if WIN64
DECLARE_REG_TMP 5
%else
; Non-Win64 branch of the t0 temp-register selection opened just above.
DECLARE_REG_TMP 7
%endif

;-----------------------------------------------------------------------
; avg_16bpc: combine two intermediate buffers into dst.
;   m4 = bidir_rnd[pixel_max >> 11], m5 = bidir_mul[pixel_max >> 11]
;   wq = address of the width-specific store label, looked up in
;        avg_avx2_table at index tzcnt(w)
; Per 16-bit element .main computes
;   pmulhw(subs(max(adds(tmp1, tmp2), rnd), rnd), mul)
; for 128 bytes from each input, leaving the result in m0..m3 and
; advancing tmp1q/tmp2q by 128 bytes.
;-----------------------------------------------------------------------
cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx2_table
    lea             r6, [avg_avx2_table]
    tzcnt           wd, wm
    mov             t0d, r6m                ; pixel_max
    movsxd          wq, [r6+wq*4]
    shr             t0d, 11
    vpbroadcastd    m4, [base+bidir_rnd+t0*4]
    vpbroadcastd    m5, [base+bidir_mul+t0*4]
    movifnidn       hd, hm
    add             wq, r6                  ; table entries are offsets from the table base
    BIDIR_FN
ALIGN function_align
.main:
    mova            m0, [tmp1q+32*0]
    paddsw          m0, [tmp2q+32*0]
    mova            m1, [tmp1q+32*1]
    paddsw          m1, [tmp2q+32*1]
    mova            m2, [tmp1q+32*2]
    paddsw          m2, [tmp2q+32*2]
    mova            m3, [tmp1q+32*3]
    paddsw          m3, [tmp2q+32*3]
    add             tmp1q, 32*4
    add             tmp2q, 32*4
    pmaxsw          m0, m4                  ; clamp from below so the subtract yields >= 0
    pmaxsw          m1, m4
    pmaxsw          m2, m4
    pmaxsw          m3, m4
    psubsw          m0, m4
    psubsw          m1, m4
    psubsw          m2, m4
    psubsw          m3, m4
    pmulhw          m0, m5
    pmulhw          m1, m5
    pmulhw          m2, m5
    pmulhw          m3, m5
    ret

;-----------------------------------------------------------------------
; w_avg_16bpc: weighted combination of two intermediate buffers.
;   m8 = pixel_max broadcast to words (upper clamp)
;   m7 = per-dword bias: (pd_65538 + pixel_max words) << 7
;   m6 = per-dword word pair {16-weight, weight}; the pair is rotated left
;        by 2 bits when bit 11 (0x800) of pixel_max is set
; .main interleaves tmp2 (low word) with tmp1 (high word), so pmaddwd gives
; tmp2*(16-weight) + tmp1*weight. It then adds the bias, shifts right by 8,
; packs with unsigned saturation and clamps to pixel_max.
; Results in m0..m3; m4 and m5 are scratch.
;-----------------------------------------------------------------------
cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
    lea             r6, [w_avg_avx2_table]
    tzcnt           wd, wm
    mov             t0d, r6m                ; weight
    vpbroadcastw    m8, r7m                 ; pixel_max
    vpbroadcastd    m7, [r6-w_avg_avx2_table+pd_65538]
    movsxd          wq, [r6+wq*4]
    paddw           m7, m8
    add             wq, r6
    lea             r6d, [t0-16]
    shl             t0d, 16
    sub             t0d, r6d                ; 16-weight, weight
    pslld           m7, 7
    rorx            r6d, t0d, 30            ; << 2
    test      dword r7m, 0x800
    cmovz           r6d, t0d                ; bit clear: keep the unshifted weights
    movifnidn       hd, hm
    movd            xm6, r6d
    vpbroadcastd    m6, xm6
    BIDIR_FN
ALIGN function_align
.main:
    mova            m4, [tmp1q+32*0]
    mova            m0, [tmp2q+32*0]
    punpckhwd       m5, m0, m4
    punpcklwd       m0, m4
    mova            m4, [tmp1q+32*1]
    mova            m1, [tmp2q+32*1]
    pmaddwd         m5, m6
    pmaddwd         m0, m6
    paddd           m5, m7
    paddd           m0, m7
    psrad           m5, 8
    psrad           m0, 8
    packusdw        m0, m5
    punpckhwd       m5, m1, m4
    punpcklwd       m1, m4
    mova            m4, [tmp1q+32*2]
    mova            m2, [tmp2q+32*2]
    pmaddwd         m5, m6
    pmaddwd         m1, m6
    paddd           m5, m7
    paddd           m1, m7
    psrad           m5, 8
    psrad           m1, 8
    packusdw        m1, m5
    punpckhwd       m5, m2, m4
    punpcklwd       m2, m4
    mova            m4, [tmp1q+32*3]
    mova            m3, [tmp2q+32*3]
    add             tmp1q, 32*4
    add             tmp2q, 32*4
    pmaddwd         m5, m6
    pmaddwd         m2, m6
    paddd           m5, m7
    paddd           m2, m7
    psrad           m5, 8
    psrad           m2, 8
    packusdw        m2, m5
    punpckhwd       m5, m3, m4
    punpcklwd       m3, m4
    pmaddwd         m5, m6
    pmaddwd         m3, m6
    paddd           m5, m7
    paddd           m3, m7
    psrad           m5, 8
    psrad           m3, 8
    packusdw        m3, m5
    pminsw          m0, m8
    pminsw          m1, m8
    pminsw          m2, m8
    pminsw          m3, m8
    ret

;-----------------------------------------------------------------------
; mask_16bpc: per-pixel masked blend of two intermediate buffers.
;   m8  = pw_64
;   m9  = bidir_rnd[pixel_max >> 11], m10 = bidir_mul[pixel_max >> 11]
;   maskq = pointer to byte masks (one byte per output pixel)
; For each element: (tmp1*m + tmp2*(64-m)) >> 5, packed with signed
; saturation, then the same max/subtract/pmulhw output stage as avg_16bpc.
; .main consumes 64 mask bytes and 128 bytes of each input per call and
; leaves the result in m0..m3; m4..m7 are scratch.
;-----------------------------------------------------------------------
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx2_table
    lea             r7, [mask_avx2_table]
    tzcnt           wd, wm
    mov             r6d, r7m                ; pixel_max
    movifnidn       hd, hm
    shr             r6d, 11
    movsxd          wq, [r7+wq*4]
    vpbroadcastd    m8, [base+pw_64]
    vpbroadcastd    m9, [base+bidir_rnd+r6*4]
    vpbroadcastd    m10, [base+bidir_mul+r6*4]
    mov             maskq, maskmp
    add             wq, r7
    BIDIR_FN
ALIGN function_align
.main:
; MASK n: blend the n-th 32-byte slice of tmp1/tmp2 using the n-th 16 mask
; bytes; the result goes to m<n>.
%macro MASK 1
    pmovzxbw        m5, [maskq+16*%1]
    mova            m%1, [tmp1q+32*%1]
    mova            m6, [tmp2q+32*%1]
    punpckhwd       m4, m%1, m6
    punpcklwd       m%1, m6
    psubw           m7, m8, m5
    punpckhwd       m6, m5, m7              ; m, 64-m
    punpcklwd       m5, m7
    pmaddwd         m4, m6                  ; tmp1 * m + tmp2 * (64-m)
    pmaddwd         m%1, m5
    psrad           m4, 5
    psrad           m%1, 5
    packssdw        m%1, m4
    pmaxsw          m%1, m9
    psubsw          m%1, m9
    pmulhw          m%1, m10
%endmacro
    MASK            0
    MASK            1
    MASK            2
    MASK            3
    add             maskq, 16*4
    add             tmp1q, 32*4
    add             tmp2q, 32*4
    ret

; w_mask_420_16bpc: only the setup and the first statements of .w4 are shown
; here; the function continues below.
;   m10 = pw_27615, m11 = pw_64
;   m12 = bidir_rnd[pixel_max >> 11], m13 = bidir_mul[pixel_max >> 11]
;   m14 = (2 - sign) broadcast to words
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea             r7, [w_mask_420_avx2_table]
    tzcnt           wd, wm
    mov             r6d, r8m                ; pixel_max
    movd            xm0, r7m                ; sign
    movifnidn       hd, hm
    shr             r6d, 11
    movsxd          wq, [r7+wq*4]
    vpbroadcastd    m10, [base+pw_27615]    ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd    m11, [base+pw_64]
    vpbroadcastd    m12, [base+bidir_rnd+r6*4]
    vpbroadcastd    m13, [base+bidir_mul+r6*4]
    movd            xm14, [base+pw_2]
    mov             maskq, maskmp
    psubw           xm14, xm0
    vpbroadcastw    m14, xm14
    add             wq, r7
    call .main
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    phaddd          m4, m5
    paddw           m4, m14
    psrlw           m4, 2
    packuswb        m4, m4
Coastguard Worker vextracti128 xm5, m4, 1 5673*c0909341SAndroid Build Coastguard Worker punpcklwd xm4, xm5 5674*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 5675*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 5676*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m0, 1 5677*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 5678*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 5679*c0909341SAndroid Build Coastguard Worker mova [maskq], xm4 5680*c0909341SAndroid Build Coastguard Worker cmp hd, 8 5681*c0909341SAndroid Build Coastguard Worker jl .w4_end 5682*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5683*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm1 5684*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 5685*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m1, 1 5686*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 5687*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 5688*c0909341SAndroid Build Coastguard Worker je .w4_end 5689*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5690*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm2 5691*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm2 5692*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m2, 1 5693*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 5694*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm2 5695*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5696*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm3 5697*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm3 5698*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m3, 1 5699*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm3 5700*c0909341SAndroid Build Coastguard Worker 
movhps [dstq+stride3q ], xm3 5701*c0909341SAndroid Build Coastguard Worker.w4_end: 5702*c0909341SAndroid Build Coastguard Worker RET 5703*c0909341SAndroid Build Coastguard Worker.w8_loop: 5704*c0909341SAndroid Build Coastguard Worker call .main 5705*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5706*c0909341SAndroid Build Coastguard Worker add maskq, 16 5707*c0909341SAndroid Build Coastguard Worker.w8: 5708*c0909341SAndroid Build Coastguard Worker vperm2i128 m6, m4, m5, 0x21 5709*c0909341SAndroid Build Coastguard Worker vpblendd m4, m5, 0xf0 5710*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5711*c0909341SAndroid Build Coastguard Worker paddw m4, m6 5712*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5713*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 5714*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm5 5715*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 5716*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 5717*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 5718*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m1, 1 5719*c0909341SAndroid Build Coastguard Worker mova [maskq], xm4 5720*c0909341SAndroid Build Coastguard Worker sub hd, 8 5721*c0909341SAndroid Build Coastguard Worker jl .w8_end 5722*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5723*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm2 5724*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m2, 1 5725*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm3 5726*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m3, 1 5727*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5728*c0909341SAndroid Build Coastguard Worker.w8_end: 5729*c0909341SAndroid Build Coastguard Worker RET 5730*c0909341SAndroid Build Coastguard Worker.w16_loop: 5731*c0909341SAndroid 
Build Coastguard Worker call .main 5732*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5733*c0909341SAndroid Build Coastguard Worker add maskq, 16 5734*c0909341SAndroid Build Coastguard Worker.w16: 5735*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m4, m5 5736*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m5 5737*c0909341SAndroid Build Coastguard Worker paddw m6, m14 5738*c0909341SAndroid Build Coastguard Worker paddw m4, m6 5739*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5740*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 5741*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm5 5742*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm4, q3120 5743*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 5744*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 5745*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m2 5746*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q ], m3 5747*c0909341SAndroid Build Coastguard Worker mova [maskq], xm4 5748*c0909341SAndroid Build Coastguard Worker sub hd, 4 5749*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5750*c0909341SAndroid Build Coastguard Worker RET 5751*c0909341SAndroid Build Coastguard Worker.w32_loop: 5752*c0909341SAndroid Build Coastguard Worker call .main 5753*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5754*c0909341SAndroid Build Coastguard Worker add maskq, 32 5755*c0909341SAndroid Build Coastguard Worker.w32: 5756*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5757*c0909341SAndroid Build Coastguard Worker paddw m4, m5 5758*c0909341SAndroid Build Coastguard Worker psrlw m15, m4, 2 5759*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 5760*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*1], m1 5761*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m2 5762*c0909341SAndroid Build Coastguard 
Worker mova [dstq+strideq*1+32*1], m3 5763*c0909341SAndroid Build Coastguard Worker call .main 5764*c0909341SAndroid Build Coastguard Worker mova m6, [deint_shuf] 5765*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5766*c0909341SAndroid Build Coastguard Worker paddw m4, m5 5767*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5768*c0909341SAndroid Build Coastguard Worker packuswb m15, m4 5769*c0909341SAndroid Build Coastguard Worker vpermd m4, m6, m15 5770*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2+32*0], m0 5771*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2+32*1], m1 5772*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q +32*0], m2 5773*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q +32*1], m3 5774*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 5775*c0909341SAndroid Build Coastguard Worker sub hd, 4 5776*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5777*c0909341SAndroid Build Coastguard Worker RET 5778*c0909341SAndroid Build Coastguard Worker.w64_loop: 5779*c0909341SAndroid Build Coastguard Worker call .main 5780*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5781*c0909341SAndroid Build Coastguard Worker add maskq, 32 5782*c0909341SAndroid Build Coastguard Worker.w64: 5783*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5784*c0909341SAndroid Build Coastguard Worker paddw m15, m14, m5 5785*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 5786*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*1], m1 5787*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*2], m2 5788*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*3], m3 5789*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 ; no available registers 5790*c0909341SAndroid Build Coastguard Worker call .main 5791*c0909341SAndroid Build Coastguard Worker paddw m4, [maskq] 5792*c0909341SAndroid Build Coastguard Worker mova m6, 
[deint_shuf] 5793*c0909341SAndroid Build Coastguard Worker paddw m5, m15 5794*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5795*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 5796*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 ; 0 2 4 6 1 3 5 7 5797*c0909341SAndroid Build Coastguard Worker vpermd m4, m6, m4 5798*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m0 5799*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*1], m1 5800*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*2], m2 5801*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*3], m3 5802*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 5803*c0909341SAndroid Build Coastguard Worker sub hd, 2 5804*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5805*c0909341SAndroid Build Coastguard Worker RET 5806*c0909341SAndroid Build Coastguard Worker.w128_loop: 5807*c0909341SAndroid Build Coastguard Worker call .main 5808*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5809*c0909341SAndroid Build Coastguard Worker add maskq, 64 5810*c0909341SAndroid Build Coastguard Worker.w128: 5811*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5812*c0909341SAndroid Build Coastguard Worker paddw m5, m14 5813*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 5814*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*1], m1 5815*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*2], m2 5816*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*3], m3 5817*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 5818*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq], m5 5819*c0909341SAndroid Build Coastguard Worker call .main 5820*c0909341SAndroid Build Coastguard Worker paddw m4, m14 5821*c0909341SAndroid Build Coastguard Worker paddw m15, m14, m5 5822*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*4], m0 
5823*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*5], m1 5824*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*6], m2 5825*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*7], m3 5826*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m4 5827*c0909341SAndroid Build Coastguard Worker call .main 5828*c0909341SAndroid Build Coastguard Worker paddw m4, [maskq+32*0] 5829*c0909341SAndroid Build Coastguard Worker paddw m5, [dstq+strideq] 5830*c0909341SAndroid Build Coastguard Worker mova m6, [deint_shuf] 5831*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5832*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 5833*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 5834*c0909341SAndroid Build Coastguard Worker vpermd m4, m6, m4 5835*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m0 5836*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*1], m1 5837*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*2], m2 5838*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*3], m3 5839*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 5840*c0909341SAndroid Build Coastguard Worker call .main 5841*c0909341SAndroid Build Coastguard Worker paddw m4, [maskq+32*1] 5842*c0909341SAndroid Build Coastguard Worker mova m6, [deint_shuf] 5843*c0909341SAndroid Build Coastguard Worker paddw m5, m15 5844*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 5845*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 5846*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 5847*c0909341SAndroid Build Coastguard Worker vpermd m4, m6, m4 5848*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*4], m0 5849*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*5], m1 5850*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*6], m2 5851*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*7], m3 
5852*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m4 5853*c0909341SAndroid Build Coastguard Worker sub hd, 2 5854*c0909341SAndroid Build Coastguard Worker jg .w128_loop 5855*c0909341SAndroid Build Coastguard Worker RET 5856*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5857*c0909341SAndroid Build Coastguard Worker.main: 5858*c0909341SAndroid Build Coastguard Worker%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul 5859*c0909341SAndroid Build Coastguard Worker mova m%1, [tmp1q+32*%1] 5860*c0909341SAndroid Build Coastguard Worker mova m%2, [tmp2q+32*%1] 5861*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m%2, m%1 5862*c0909341SAndroid Build Coastguard Worker punpckhwd m9, m%2, m%1 5863*c0909341SAndroid Build Coastguard Worker psubsw m%1, m%2 5864*c0909341SAndroid Build Coastguard Worker pabsw m%1, m%1 5865*c0909341SAndroid Build Coastguard Worker psubusw m7, m10, m%1 5866*c0909341SAndroid Build Coastguard Worker psrlw m7, 10 ; 64-m 5867*c0909341SAndroid Build Coastguard Worker psubw m%2, m%3, m7 ; m 5868*c0909341SAndroid Build Coastguard Worker punpcklwd m%1, m7, m%2 5869*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m%2 5870*c0909341SAndroid Build Coastguard Worker pmaddwd m%1, m8 5871*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m9 5872*c0909341SAndroid Build Coastguard Worker psrad m%1, 5 5873*c0909341SAndroid Build Coastguard Worker psrad m7, 5 5874*c0909341SAndroid Build Coastguard Worker packssdw m%1, m7 5875*c0909341SAndroid Build Coastguard Worker pmaxsw m%1, m%4 5876*c0909341SAndroid Build Coastguard Worker psubsw m%1, m%4 5877*c0909341SAndroid Build Coastguard Worker pmulhw m%1, m%5 5878*c0909341SAndroid Build Coastguard Worker%endmacro 5879*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4 5880*c0909341SAndroid Build Coastguard Worker W_MASK 1, 5 5881*c0909341SAndroid Build Coastguard Worker phaddw m4, m5 5882*c0909341SAndroid Build Coastguard Worker W_MASK 2, 5 
5883*c0909341SAndroid Build Coastguard Worker W_MASK 3, 6 5884*c0909341SAndroid Build Coastguard Worker phaddw m5, m6 5885*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 5886*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 5887*c0909341SAndroid Build Coastguard Worker ret 5888*c0909341SAndroid Build Coastguard Worker 5889*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 5890*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx2_table 5891*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_422_avx2_table] 5892*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5893*c0909341SAndroid Build Coastguard Worker mov r6d, r8m ; pixel_max 5894*c0909341SAndroid Build Coastguard Worker vpbroadcastb m14, r7m ; sign 5895*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5896*c0909341SAndroid Build Coastguard Worker shr r6d, 11 5897*c0909341SAndroid Build Coastguard Worker movsxd wq, [r7+wq*4] 5898*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pw_27615] 5899*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pw_64] 5900*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+bidir_rnd+r6*4] 5901*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+bidir_mul+r6*4] 5902*c0909341SAndroid Build Coastguard Worker mova m15, [base+deint_shuf] 5903*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 5904*c0909341SAndroid Build Coastguard Worker add wq, r7 5905*c0909341SAndroid Build Coastguard Worker call .main 5906*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 5907*c0909341SAndroid Build Coastguard Worker jmp wq 5908*c0909341SAndroid Build Coastguard Worker.w4: 5909*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 5910*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 5911*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m0, 1 
5912*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 5913*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 5914*c0909341SAndroid Build Coastguard Worker cmp hd, 8 5915*c0909341SAndroid Build Coastguard Worker jl .w4_end 5916*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5917*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm1 5918*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 5919*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m1, 1 5920*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 5921*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 5922*c0909341SAndroid Build Coastguard Worker je .w4_end 5923*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5924*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm2 5925*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm2 5926*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m2, 1 5927*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 5928*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm2 5929*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5930*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm3 5931*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm3 5932*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m3, 1 5933*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm3 5934*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm3 5935*c0909341SAndroid Build Coastguard Worker.w4_end: 5936*c0909341SAndroid Build Coastguard Worker RET 5937*c0909341SAndroid Build Coastguard Worker.w8_loop: 5938*c0909341SAndroid Build Coastguard Worker call .main 5939*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5940*c0909341SAndroid Build Coastguard Worker.w8: 5941*c0909341SAndroid 
Build Coastguard Worker mova [dstq+strideq*0], xm0 5942*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 5943*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 5944*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m1, 1 5945*c0909341SAndroid Build Coastguard Worker sub hd, 8 5946*c0909341SAndroid Build Coastguard Worker jl .w8_end 5947*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5948*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm2 5949*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m2, 1 5950*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm3 5951*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m3, 1 5952*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5953*c0909341SAndroid Build Coastguard Worker.w8_end: 5954*c0909341SAndroid Build Coastguard Worker RET 5955*c0909341SAndroid Build Coastguard Worker.w16_loop: 5956*c0909341SAndroid Build Coastguard Worker call .main 5957*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5958*c0909341SAndroid Build Coastguard Worker.w16: 5959*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 5960*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 5961*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m2 5962*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q ], m3 5963*c0909341SAndroid Build Coastguard Worker sub hd, 4 5964*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5965*c0909341SAndroid Build Coastguard Worker RET 5966*c0909341SAndroid Build Coastguard Worker.w32_loop: 5967*c0909341SAndroid Build Coastguard Worker call .main 5968*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5969*c0909341SAndroid Build Coastguard Worker.w32: 5970*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+32*0], m0 5971*c0909341SAndroid Build Coastguard Worker 
mova [dstq+strideq*0+32*1], m1 5972*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*0], m2 5973*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+32*1], m3 5974*c0909341SAndroid Build Coastguard Worker sub hd, 2 5975*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5976*c0909341SAndroid Build Coastguard Worker RET 5977*c0909341SAndroid Build Coastguard Worker.w64_loop: 5978*c0909341SAndroid Build Coastguard Worker call .main 5979*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5980*c0909341SAndroid Build Coastguard Worker.w64: 5981*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 5982*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 5983*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m2 5984*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m3 5985*c0909341SAndroid Build Coastguard Worker dec hd 5986*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5987*c0909341SAndroid Build Coastguard Worker RET 5988*c0909341SAndroid Build Coastguard Worker.w128_loop: 5989*c0909341SAndroid Build Coastguard Worker call .main 5990*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5991*c0909341SAndroid Build Coastguard Worker.w128: 5992*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 5993*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 5994*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m2 5995*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m3 5996*c0909341SAndroid Build Coastguard Worker call .main 5997*c0909341SAndroid Build Coastguard Worker mova [dstq+32*4], m0 5998*c0909341SAndroid Build Coastguard Worker mova [dstq+32*5], m1 5999*c0909341SAndroid Build Coastguard Worker mova [dstq+32*6], m2 6000*c0909341SAndroid Build Coastguard Worker mova [dstq+32*7], m3 6001*c0909341SAndroid Build Coastguard Worker dec hd 6002*c0909341SAndroid Build Coastguard Worker jg .w128_loop 6003*c0909341SAndroid Build Coastguard Worker RET 
6004*c0909341SAndroid Build Coastguard WorkerALIGN function_align 6005*c0909341SAndroid Build Coastguard Worker.main: 6006*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4 6007*c0909341SAndroid Build Coastguard Worker W_MASK 1, 5 6008*c0909341SAndroid Build Coastguard Worker phaddw m4, m5 6009*c0909341SAndroid Build Coastguard Worker W_MASK 2, 5 6010*c0909341SAndroid Build Coastguard Worker W_MASK 3, 6 6011*c0909341SAndroid Build Coastguard Worker phaddw m5, m6 6012*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 6013*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 6014*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6015*c0909341SAndroid Build Coastguard Worker pxor m5, m5 6016*c0909341SAndroid Build Coastguard Worker psubb m4, m14 6017*c0909341SAndroid Build Coastguard Worker pavgb m4, m5 6018*c0909341SAndroid Build Coastguard Worker vpermd m4, m15, m4 6019*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 6020*c0909341SAndroid Build Coastguard Worker add maskq, 32 6021*c0909341SAndroid Build Coastguard Worker ret 6022*c0909341SAndroid Build Coastguard Worker 6023*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 6024*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx2_table 6025*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_444_avx2_table] 6026*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 6027*c0909341SAndroid Build Coastguard Worker mov r6d, r8m ; pixel_max 6028*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 6029*c0909341SAndroid Build Coastguard Worker shr r6d, 11 6030*c0909341SAndroid Build Coastguard Worker movsxd wq, [r7+wq*4] 6031*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pw_27615] 6032*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+pw_64] 6033*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+bidir_rnd+r6*4] 6034*c0909341SAndroid Build 
Coastguard Worker vpbroadcastd m6, [base+bidir_mul+r6*4] 6035*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 6036*c0909341SAndroid Build Coastguard Worker add wq, r7 6037*c0909341SAndroid Build Coastguard Worker call .main 6038*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 6039*c0909341SAndroid Build Coastguard Worker jmp wq 6040*c0909341SAndroid Build Coastguard Worker.w4: 6041*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 6042*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 6043*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m0, 1 6044*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 6045*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 6046*c0909341SAndroid Build Coastguard Worker cmp hd, 8 6047*c0909341SAndroid Build Coastguard Worker jl .w4_end 6048*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6049*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm1 6050*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 6051*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m1, 1 6052*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 6053*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 6054*c0909341SAndroid Build Coastguard Worker je .w4_end 6055*c0909341SAndroid Build Coastguard Worker call .main 6056*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6057*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 6058*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 6059*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m0, 1 6060*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 6061*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 6062*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6063*c0909341SAndroid Build Coastguard Worker 
movq [dstq+strideq*0], xm1 6064*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 6065*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m1, 1 6066*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 6067*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 6068*c0909341SAndroid Build Coastguard Worker.w4_end: 6069*c0909341SAndroid Build Coastguard Worker RET 6070*c0909341SAndroid Build Coastguard Worker.w8_loop: 6071*c0909341SAndroid Build Coastguard Worker call .main 6072*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6073*c0909341SAndroid Build Coastguard Worker.w8: 6074*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 6075*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 6076*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 6077*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m1, 1 6078*c0909341SAndroid Build Coastguard Worker sub hd, 4 6079*c0909341SAndroid Build Coastguard Worker jg .w8_loop 6080*c0909341SAndroid Build Coastguard Worker.w8_end: 6081*c0909341SAndroid Build Coastguard Worker RET 6082*c0909341SAndroid Build Coastguard Worker.w16_loop: 6083*c0909341SAndroid Build Coastguard Worker call .main 6084*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 6085*c0909341SAndroid Build Coastguard Worker.w16: 6086*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 6087*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 6088*c0909341SAndroid Build Coastguard Worker sub hd, 2 6089*c0909341SAndroid Build Coastguard Worker jg .w16_loop 6090*c0909341SAndroid Build Coastguard Worker RET 6091*c0909341SAndroid Build Coastguard Worker.w32_loop: 6092*c0909341SAndroid Build Coastguard Worker call .main 6093*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6094*c0909341SAndroid Build Coastguard Worker.w32: 6095*c0909341SAndroid Build 
Coastguard Worker mova [dstq+32*0], m0 6096*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 6097*c0909341SAndroid Build Coastguard Worker dec hd 6098*c0909341SAndroid Build Coastguard Worker jg .w32_loop 6099*c0909341SAndroid Build Coastguard Worker RET 6100*c0909341SAndroid Build Coastguard Worker.w64_loop: 6101*c0909341SAndroid Build Coastguard Worker call .main 6102*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6103*c0909341SAndroid Build Coastguard Worker.w64: 6104*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6105*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 6106*c0909341SAndroid Build Coastguard Worker call .main 6107*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 6108*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m1 6109*c0909341SAndroid Build Coastguard Worker dec hd 6110*c0909341SAndroid Build Coastguard Worker jg .w64_loop 6111*c0909341SAndroid Build Coastguard Worker RET 6112*c0909341SAndroid Build Coastguard Worker.w128_loop: 6113*c0909341SAndroid Build Coastguard Worker call .main 6114*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6115*c0909341SAndroid Build Coastguard Worker.w128: 6116*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6117*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 6118*c0909341SAndroid Build Coastguard Worker call .main 6119*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 6120*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m1 6121*c0909341SAndroid Build Coastguard Worker call .main 6122*c0909341SAndroid Build Coastguard Worker mova [dstq+32*4], m0 6123*c0909341SAndroid Build Coastguard Worker mova [dstq+32*5], m1 6124*c0909341SAndroid Build Coastguard Worker call .main 6125*c0909341SAndroid Build Coastguard Worker mova [dstq+32*6], m0 6126*c0909341SAndroid Build Coastguard Worker mova [dstq+32*7], m1 6127*c0909341SAndroid Build Coastguard Worker dec hd 6128*c0909341SAndroid 
Build Coastguard Worker jg .w128_loop 6129*c0909341SAndroid Build Coastguard Worker RET 6130*c0909341SAndroid Build Coastguard WorkerALIGN function_align 6131*c0909341SAndroid Build Coastguard Worker.main: 6132*c0909341SAndroid Build Coastguard Worker W_MASK 0, 2, 4, 5, 6 6133*c0909341SAndroid Build Coastguard Worker W_MASK 1, 3, 4, 5, 6 6134*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 6135*c0909341SAndroid Build Coastguard Worker vpermq m2, m2, q3120 6136*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*2 6137*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*2 6138*c0909341SAndroid Build Coastguard Worker mova [maskq], m2 6139*c0909341SAndroid Build Coastguard Worker add maskq, 32 6140*c0909341SAndroid Build Coastguard Worker ret 6141*c0909341SAndroid Build Coastguard Worker 6142*c0909341SAndroid Build Coastguard Worker; (a * (64 - m) + b * m + 32) >> 6 6143*c0909341SAndroid Build Coastguard Worker; = (((b - a) * m + 32) >> 6) + a 6144*c0909341SAndroid Build Coastguard Worker; = (((b - a) * (m << 9) + 16384) >> 15) + a 6145*c0909341SAndroid Build Coastguard Worker; except m << 9 overflows int16_t when m == 64 (which is possible), 6146*c0909341SAndroid Build Coastguard Worker; but if we negate m it works out (-64 << 9 == -32768). 
; = (((a - b) * (m * -512) + 16384) >> 15) + a
;
; blend_16bpc(dst, ds, tmp, w, h, mask)
;   Blends tmp into dst in place using the last form of the formula above,
;   with a = dst pixel, b = tmp pixel and m = mask byte:
;       dst += pmulhrsw(dst - tmp, mask * -512)
;   The width path is selected through blend_avx2_table, indexed by tzcnt(w).
;   tmp and mask are consumed linearly; only dst is addressed via the stride.
;   m6 holds the pw_m512 constant (per its name, -512 in every word) for the
;   whole function, and r6 is re-purposed as ds*3 once the table is resolved.
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea             r6, [blend_avx2_table]
    tzcnt           wd, wm                      ; table index = tzcnt(w)
    movifnidn       hd, hm
    movsxd          wq, [r6+wq*4]               ; table entries are 32-bit offsets
    movifnidn       maskq, maskmp
    vpbroadcastd    m6, [base+pw_m512]          ; must be read before r6 is reused
    add             wq, r6
    lea             r6, [dsq*3]                 ; r6 = stride*3 (4th row of a group)
    jmp             wq

    ; 4 pixels wide: four rows are packed into one ymm per iteration.
.w4:
    pmovzxbw        m3, [maskq]                 ; 16 mask bytes -> 16 words
    movq            xm0, [dstq+dsq*0]
    movhps          xm0, [dstq+dsq*1]
    vpbroadcastq    m1, [dstq+dsq*2]
    vpbroadcastq    m2, [dstq+r6   ]
    vpblendd        m0, m1, 0x30                ; row 2 -> dwords 4-5
    vpblendd        m0, m2, 0xc0                ; row 3 -> dwords 6-7
    psubw           m1, m0, [tmpq]              ; a - b
    add             tmpq, 32
    add             maskq, 16
    pmullw          m3, m6                      ; m * -512
    pmulhrsw        m1, m3                      ; ((a - b) * (m * -512) + 16384) >> 15
    paddw           m0, m1
    vextracti128    xm1, m0, 1
    movq            [dstq+dsq*0], xm0
    movhps          [dstq+dsq*1], xm0
    movq            [dstq+dsq*2], xm1
    movhps          [dstq+r6   ], xm1
    lea             dstq, [dstq+dsq*4]
    sub             hd, 4
    jg              .w4
    RET

    ; 8 pixels wide: two rows per ymm, four rows per iteration.
.w8:
    pmovzxbw        m4, [maskq+16*0]
    pmovzxbw        m5, [maskq+16*1]
    mova            xm0, [dstq+dsq*0]
    vinserti128     m0, [dstq+dsq*1], 1
    mova            xm1, [dstq+dsq*2]
    vinserti128     m1, [dstq+r6   ], 1
    psubw           m2, m0, [tmpq+32*0]
    psubw           m3, m1, [tmpq+32*1]
    add             tmpq, 32*2
    add             maskq, 16*2
    pmullw          m4, m6
    pmullw          m5, m6
    pmulhrsw        m2, m4
    pmulhrsw        m3, m5
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*1], m0, 1
    mova            [dstq+dsq*2], xm1
    vextracti128    [dstq+r6   ], m1, 1
    lea             dstq, [dstq+dsq*4]
    sub             hd, 4
    jg              .w8
    RET

    ; 16 pixels wide: one row per ymm, two rows per iteration.
.w16:
    pmovzxbw        m4, [maskq+16*0]
    pmovzxbw        m5, [maskq+16*1]
    mova            m0, [dstq+dsq*0]
    psubw           m2, m0, [tmpq+32*0]
    mova            m1, [dstq+dsq*1]
    psubw           m3, m1, [tmpq+32*1]
    add             tmpq, 32*2
    add             maskq, 16*2
    pmullw          m4, m6
    pmullw          m5, m6
    pmulhrsw        m2, m4
    pmulhrsw        m3, m5
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+dsq*0], m0
    mova            [dstq+dsq*1], m1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w16
    RET

    ; 32 pixels wide: two ymm per row, one row per iteration.
.w32:
    pmovzxbw        m4, [maskq+16*0]
    pmovzxbw        m5, [maskq+16*1]
    mova            m0, [dstq+32*0]
    psubw           m2, m0, [tmpq+32*0]
    mova            m1, [dstq+32*1]
    psubw           m3, m1, [tmpq+32*1]
    add             tmpq, 32*2
    add             maskq, 16*2
    pmullw          m4, m6
    pmullw          m5, m6
    pmulhrsw        m2, m4
    pmulhrsw        m3, m5
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+32*0], m0
    mova            [dstq+32*1], m1
    add             dstq, dsq
    dec             hd
    jg              .w32
    RET

; blend_v_16bpc(dst, ds, tmp, w, h)
;   Same blend as above, but the weights are per-column entries of
;   obmc_masks_avx2 (already multiplied by -512, see the table's comment), so
;   no pmullw is needed. The entries for width w start w words into the table
;   and are identical for every row, hence they are loaded once per width path.
;   Every loop handles two rows per iteration. The w2/w4 paths are assembled
;   with xmm registers, the wider ones with ymm registers.
INIT_XMM avx2
cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
%define base r5-blend_v_avx2_table
    lea             r5, [blend_v_avx2_table]
    tzcnt           wd, wm
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    jmp             wq
.w2:
    vpbroadcastd    m2, [base+obmc_masks_avx2+2*2]  ; 2 column weights, repeated
.w2_loop:
    movd            m0, [dstq+dsq*0]
    pinsrd          m0, [dstq+dsq*1], 1
    movq            m1, [tmpq]
    add             tmpq, 4*2
    psubw           m1, m0, m1                  ; dst - tmp
    pmulhrsw        m1, m2
    paddw           m0, m1
    movd            [dstq+dsq*0], m0
    pextrd          [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w2_loop
    RET
.w4:
    vpbroadcastq    m2, [base+obmc_masks_avx2+4*2]  ; 4 column weights, repeated
.w4_loop:
    movq            m0, [dstq+dsq*0]
    movhps          m0, [dstq+dsq*1]
    psubw           m1, m0, [tmpq]
    add             tmpq, 8*2
    pmulhrsw        m1, m2
    paddw           m0, m1
    movq            [dstq+dsq*0], m0
    movhps          [dstq+dsq*1], m0
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128  m2, [base+obmc_masks_avx2+8*2]  ; 8 column weights per lane
.w8_loop:
    mova            xm0, [dstq+dsq*0]
    vinserti128     m0, [dstq+dsq*1], 1
    psubw           m1, m0, [tmpq]
    add             tmpq, 16*2
    pmulhrsw        m1, m2
    paddw           m0, m1
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w8_loop
    RET
.w16:
    mova            m4, [base+obmc_masks_avx2+16*2] ; 16 column weights
.w16_loop:
    mova            m0, [dstq+dsq*0]
    psubw           m2, m0, [tmpq+32*0]
    mova            m1, [dstq+dsq*1]
    psubw           m3, m1, [tmpq+32*1]
    add             tmpq, 32*2
    pmulhrsw        m2, m4
    pmulhrsw        m3, m4
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+dsq*0], m0
    mova            [dstq+dsq*1], m1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w16_loop
    RET
.w32:
%if WIN64
    ; This path uses m6/m7 although cglobal only declares 6 vector registers,
    ; so xmm6/xmm7 are saved and restored by hand around the loop.
    ; NOTE(review): the [rsp+8]/[rsp+24] slots look like the caller-provided
    ; shadow space and rely on rsp being unadjusted here - confirm.
    movaps          [rsp+ 8], xmm6
    movaps          [rsp+24], xmm7
%endif
    mova            m6, [base+obmc_masks_avx2+32*2] ; weights for columns 0-15
    vbroadcasti128  m7, [base+obmc_masks_avx2+32*3] ; weights for columns 16-23, per lane
.w32_loop:
    ; Per row only 32+16 bytes (24 pixels) of dst are loaded and stored; the
    ; third 8-pixel group of both rows shares one ymm (m2 for dst, m5 for tmp).
    mova            m0, [dstq+dsq*0+32*0]
    psubw           m3, m0, [tmpq     +32*0]
    mova            xm2, [dstq+dsq*0+32*1]
    mova            xm5, [tmpq     +32*1]
    mova            m1, [dstq+dsq*1+32*0]
    psubw           m4, m1, [tmpq     +32*2]
    vinserti128     m2, [dstq+dsq*1+32*1], 1
    vinserti128     m5, [tmpq     +32*3], 1
    add             tmpq, 32*4
    psubw           m5, m2, m5
    pmulhrsw        m3, m6
    pmulhrsw        m4, m6
    pmulhrsw        m5, m7
    paddw           m0, m3
    paddw           m1, m4
    paddw           m2, m5
    mova            [dstq+dsq*0+32*0], m0
    mova            [dstq+dsq*1+32*0], m1
    mova            [dstq+dsq*0+32*1], xm2
    vextracti128    [dstq+dsq*1+32*1], m2, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w32_loop
%if WIN64
    movaps          xmm6, [rsp+ 8]
    movaps          xmm7, [rsp+24]
%endif
    RET

; BLEND_H_ROW dst_off, tmp_off [, inc_tmp]
;   Blends two consecutive ymm vectors (32 pixels) of the current row using
;   the row weight already broadcast into m4. dst_off and tmp_off are in units
;   of 32 bytes. When inc_tmp is non-zero, tmpq is advanced by 32*inc_tmp
;   bytes right after the tmp loads, so later invocations on the same row must
;   use offsets relative to the advanced pointer.
;   Clobbers m0-m3.
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    mova            m0, [dstq+32*(%1+0)]
    psubw           m2, m0, [tmpq+32*(%2+0)]
    mova            m1, [dstq+32*(%1+1)]
    psubw           m3, m1, [tmpq+32*(%2+1)]
%if %3
    add             tmpq, 32*%3
%endif
    pmulhrsw        m2, m4
    pmulhrsw        m3, m4
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+32*(%1+0)], m0
    mova            [dstq+32*(%1+1)], m1
%endmacro

; blend_h_16bpc(dst, ds, tmp, w, h)
;   Blend with one weight per row, taken from obmc_masks_avx2 (pre-multiplied
;   by -512). Only h*3/4 rows are processed. maskq is set up to point just
;   past the last weight that will be used, and hq runs from -(h*3/4) up to
;   zero, so [maskq+hq*2] walks the weights for height h in order.
INIT_XMM avx2
cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea             r5, [blend_h_avx2_table]
    tzcnt           wd, wm
    mov             hd, hm
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    lea             maskq, [base+obmc_masks_avx2+hq*2]  ; weights for height h
    lea             hd, [hq*3]
    shr             hd, 2 ; h * 3/4
    lea             maskq, [maskq+hq*2]         ; end of the weights actually used
    neg             hq
    jmp             wq
.w2:
    movd            m0, [dstq+dsq*0]
    pinsrd          m0, [dstq+dsq*1], 1
    movd            m2, [maskq+hq*2]            ; weights of two rows
    movq            m1, [tmpq]
    add             tmpq, 4*2
    punpcklwd       m2, m2                      ; duplicate each weight for its 2 pixels
    psubw           m1, m0, m1
    pmulhrsw        m1, m2
    paddw           m0, m1
    movd            [dstq+dsq*0], m0
    pextrd          [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w2
    RET
.w4:
    ; blend_shuf is defined outside this chunk; presumably it replicates each
    ; row's weight word across that row's pixels - verify against the table.
    mova            m3, [blend_shuf]
.w4_loop:
    movq            m0, [dstq+dsq*0]
    movhps          m0, [dstq+dsq*1]
    movd            m2, [maskq+hq*2]
    psubw           m1, m0, [tmpq]
    add             tmpq, 8*2
    pshufb          m2, m3
    pmulhrsw        m1, m2
    paddw           m0, m1
    movq            [dstq+dsq*0], m0
    movhps          [dstq+dsq*1], m0
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128  m3, [blend_shuf]
    shufpd          m3, m3, 0x0c                ; rearrange the shuffle qwords per lane for the 8-wide layout
.w8_loop:
    mova            xm0, [dstq+dsq*0]
    vinserti128     m0, [dstq+dsq*1], 1
    vpbroadcastd    m2, [maskq+hq*2]            ; weights of two rows in every dword
    psubw           m1, m0, [tmpq]
    add             tmpq, 16*2
    pshufb          m2, m3
    pmulhrsw        m1, m2
    paddw           m0, m1
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w8_loop
    RET
.w16:
    vpbroadcastw    m4, [maskq+hq*2]            ; weight of the first row
    vpbroadcastw    m5, [maskq+hq*2+2]          ; weight of the second row
    mova            m0, [dstq+dsq*0]
    psubw           m2, m0, [tmpq+32*0]
    mova            m1, [dstq+dsq*1]
    psubw           m3, m1, [tmpq+32*1]
    add             tmpq, 32*2
    pmulhrsw        m2, m4
    pmulhrsw        m3, m5
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+dsq*0], m0
    mova            [dstq+dsq*1], m1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w16
    RET
    ; w32 and wider: one row per iteration, weight broadcast into m4.
.w32:
    vpbroadcastw    m4, [maskq+hq*2]
    BLEND_H_ROW     0, 0, 2
    add             dstq, dsq
    inc             hq
    jl              .w32
    RET
.w64:
    vpbroadcastw    m4, [maskq+hq*2]
    BLEND_H_ROW     0, 0
    BLEND_H_ROW     2, 2, 4
    add             dstq, dsq
    inc             hq
    jl              .w64
    RET
.w128:
    vpbroadcastw    m4, [maskq+hq*2]
    BLEND_H_ROW     0, 0
    BLEND_H_ROW     2, 2, 8                     ; tmpq now points at the next row
    BLEND_H_ROW     4, -4                       ; negative offsets reach back into
    BLEND_H_ROW     6, -2                       ; the second half of this row
    add             dstq, dsq
    inc             hq
    jl              .w128
    RET

; emu_edge_16bpc(bw, bh, iw, ih, x, y, dst, dstride, src, sstride)
;   Fills a bw x bh block at dst from a source picture of size iw x ih, with
;   the block's top-left corner at (x, y) in picture coordinates. Parts of the
;   block that fall outside the picture are produced by replicating the
;   nearest edge pixel. The work is split into:
;     1. clamp the source pointer and compute the four extension sizes,
;     2. copy the center rows, extending each one to the left/right (v_loop),
;     3. replicate the last written row downwards (bottom extension),
;     4. replicate the first written row upwards (top extension).
;   r12 holds zero during step 1 and is reused as a pointer in step 2.
;   DEFINE_ARGS is used repeatedly to rename registers as values die.
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                   bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor             r12d, r12d
    lea             r10, [ihq-1]
    cmp             yq, ihq
    cmovs           r10, yq
    test            yq, yq
    cmovs           r10, r12
    imul            r10, sstrideq
    add             srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea             r10, [iwq-1]
    cmp             xq, iwq
    cmovs           r10, xq
    test            xq, xq
    cmovs           r10, r12
    lea             srcq, [srcq+r10*2]

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea             bottomextq, [yq+bhq]
    sub             bottomextq, ihq
    lea             r3, [bhq-1]                 ; lea keeps the flags of the sub above
    cmovs           bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp             bottomextq, bhq
    cmovns          bottomextq, r3
    cmp             topextq, bhq
    cmovg           topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea             rightextq, [xq+bwq]
    sub             rightextq, iwq
    lea             r2, [bwq-1]                 ; lea keeps the flags of the sub above
    cmovs           rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg             leftextq
    cmovs           leftextq, r12
    cmp             rightextq, bwq
    cmovns          rightextq, r2
    cmp             leftextq, bwq
    cmovns          leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea             r3, [bottomextq+topextq]
    sub             centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov             r2, topextq
    imul            r2, dstrideq
    add             dstq, r2
    mov             r9m, dstq                   ; remember the first center row for .top

    ; center_w = bw - left_ext - right_ext
    mov             centerwq, bwq
    lea             r3, [rightextq+leftextq]
    sub             centerwq, r3

; v_loop need_left_ext, need_right_ext, suffix
;   Emits the per-row loop for one combination of left/right extension. Each
;   row is written in steps of 16 pixels (one ymm): the left fill broadcasts
;   the row's first source pixel, the body copies center_w pixels, and the
;   right fill broadcasts the last copied source pixel. The inner loops run at
;   least once and may write past the requested length (see the stride
;   assumption at the top of the function). Clobbers r3, r12 and m0.
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    vpbroadcastw    m0, [srcq]
    xor             r3, r3
.left_loop_%3:
    mova            [dstq+r3*2], m0
    add             r3, 16
    cmp             r3, leftextq
    jl              .left_loop_%3

    ; body
    lea             r12, [dstq+leftextq*2]
%endif
    xor             r3, r3
.body_loop_%3:
    movu            m0, [srcq+r3*2]
%if %1
    movu            [r12+r3*2], m0
%else
    movu            [dstq+r3*2], m0
%endif
    add             r3, 16
    cmp             r3, centerwq
    jl              .body_loop_%3

%if %2
    ; right extension
%if %1
    lea             r12, [r12+centerwq*2]
%else
    lea             r12, [dstq+centerwq*2]
%endif
    vpbroadcastw    m0, [srcq+centerwq*2-2]
    xor             r3, r3
.right_loop_%3:
    movu            [r12+r3*2], m0
    add             r3, 16
    cmp             r3, rightextq
    jl              .right_loop_%3

%endif
    add             dstq, dstrideq
    add             srcq, sstrideq
    dec             centerhq
    jg              .v_loop_%3
%endmacro

    ; Pick the v_loop variant matching the extensions that are needed.
    test            leftextq, leftextq
    jnz             .need_left_ext
    test            rightextq, rightextq
    jnz             .need_right_ext
    v_loop          0, 0, 0
    jmp             .body_done

.need_left_ext:
    test            rightextq, rightextq
    jnz             .need_left_right_ext
    v_loop          1, 0, 1
    jmp             .body_done

.need_left_right_ext:
    v_loop          1, 1, 2
    jmp             .body_done

.need_right_ext:
    v_loop          0, 1, 3

.body_done:
    ; bottom edge extension
    ; dstq points one row past the center block; the row above it is the
    ; source that is replicated bottom_ext times, 16 pixels per column step.
    test            bottomextq, bottomextq
    jz              .top
    xor             r1, r1                      ; r1 = column index in pixels
    mov             srcq, dstq
    sub             srcq, dstrideq
.bottom_x_loop:
    mova            m0, [srcq+r1*2]
    lea             r3, [dstq+r1*2]
    mov             r4, bottomextq              ; r4 = rows left in this column
.bottom_y_loop:
    mova            [r3], m0
    add             r3, dstrideq
    dec             r4
    jg              .bottom_y_loop
    add             r1, 16
    cmp             r1, bwq
    jl              .bottom_x_loop

.top:
    ; top edge extension
    ; The first center row (saved in r9m) is replicated top_ext times,
    ; starting at the original dst pointer reloaded from its argument slot.
    test            topextq, topextq
    jz              .end
    mov             srcq, r9m
    mov             dstq, dstm
    xor             r1, r1
.top_x_loop:
    mova            m0, [srcq+r1*2]
    lea             r3, [dstq+r1*2]
    mov             r4, topextq
.top_y_loop:
    mova            [r3], m0
    add             r3, dstrideq
    dec             r4
    jg              .top_y_loop
    add             r1, 16
    cmp             r1, bwq
    jl              .top_x_loop

.end:
    RET

cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub             dword mx0m, 4<<14
Coastguard Worker sub dword src_wm, 8 6678*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, dxm 6679*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, mx0m 6680*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, src_wm 6681*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax 6682*c0909341SAndroid Build Coastguard Worker LEA r7, $$ 6683*c0909341SAndroid Build Coastguard Worker%define base r7-$$ 6684*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [base+pd_64] 6685*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm7, pxmaxm 6686*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] 6687*c0909341SAndroid Build Coastguard Worker pslld m5, 3 ; dx*8 6688*c0909341SAndroid Build Coastguard Worker pslld m6, 14 6689*c0909341SAndroid Build Coastguard Worker paddd m8, m2 ; mx+[0..7]*dx 6690*c0909341SAndroid Build Coastguard Worker.loop_y: 6691*c0909341SAndroid Build Coastguard Worker xor xd, xd 6692*c0909341SAndroid Build Coastguard Worker mova m4, m8 ; per-line working version of mx 6693*c0909341SAndroid Build Coastguard Worker.loop_x: 6694*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pd_63] 6695*c0909341SAndroid Build Coastguard Worker pxor m2, m2 6696*c0909341SAndroid Build Coastguard Worker pmaxsd m0, m4, m2 6697*c0909341SAndroid Build Coastguard Worker psrad m9, m4, 8 ; filter offset (unmasked) 6698*c0909341SAndroid Build Coastguard Worker pminsd m0, m6 ; iclip(mx, 0, src_w-8) 6699*c0909341SAndroid Build Coastguard Worker psubd m1, m4, m0 ; pshufb offset 6700*c0909341SAndroid Build Coastguard Worker psrad m0, 14 ; clipped src_x offset 6701*c0909341SAndroid Build Coastguard Worker psrad m1, 14 ; pshufb edge_emu offset 6702*c0909341SAndroid Build Coastguard Worker pand m9, m10 ; filter offset (masked) 6703*c0909341SAndroid Build Coastguard Worker ; load source pixels 6704*c0909341SAndroid Build Coastguard Worker 
movd r8d, xm0 6705*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm0, 1 6706*c0909341SAndroid Build Coastguard Worker pextrd r10d, xm0, 2 6707*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm0, 3 6708*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m0, 1 6709*c0909341SAndroid Build Coastguard Worker movu xm10, [srcq+r8*2] 6710*c0909341SAndroid Build Coastguard Worker movu xm11, [srcq+r9*2] 6711*c0909341SAndroid Build Coastguard Worker movu xm12, [srcq+r10*2] 6712*c0909341SAndroid Build Coastguard Worker movu xm13, [srcq+r11*2] 6713*c0909341SAndroid Build Coastguard Worker movd r8d, xm0 6714*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm0, 1 6715*c0909341SAndroid Build Coastguard Worker pextrd r10d, xm0, 2 6716*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm0, 3 6717*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [srcq+r8*2], 1 6718*c0909341SAndroid Build Coastguard Worker vinserti128 m11, [srcq+r9*2], 1 6719*c0909341SAndroid Build Coastguard Worker vinserti128 m12, [srcq+r10*2], 1 6720*c0909341SAndroid Build Coastguard Worker vinserti128 m13, [srcq+r11*2], 1 6721*c0909341SAndroid Build Coastguard Worker ptest m1, m1 6722*c0909341SAndroid Build Coastguard Worker jz .filter 6723*c0909341SAndroid Build Coastguard Worker movq r9, xm1 6724*c0909341SAndroid Build Coastguard Worker pextrq r11, xm1, 1 6725*c0909341SAndroid Build Coastguard Worker movsxd r8, r9d 6726*c0909341SAndroid Build Coastguard Worker sar r9, 32 6727*c0909341SAndroid Build Coastguard Worker movsxd r10, r11d 6728*c0909341SAndroid Build Coastguard Worker sar r11, 32 6729*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m1, 1 6730*c0909341SAndroid Build Coastguard Worker movu xm14, [base+resize_shuf+8+r8*2] 6731*c0909341SAndroid Build Coastguard Worker movu xm15, [base+resize_shuf+8+r9*2] 6732*c0909341SAndroid Build Coastguard Worker movu xm0, [base+resize_shuf+8+r10*2] 6733*c0909341SAndroid Build Coastguard Worker movu xm2, 
[base+resize_shuf+8+r11*2] 6734*c0909341SAndroid Build Coastguard Worker movq r9, xm1 6735*c0909341SAndroid Build Coastguard Worker pextrq r11, xm1, 1 6736*c0909341SAndroid Build Coastguard Worker movsxd r8, r9d 6737*c0909341SAndroid Build Coastguard Worker sar r9, 32 6738*c0909341SAndroid Build Coastguard Worker movsxd r10, r11d 6739*c0909341SAndroid Build Coastguard Worker sar r11, 32 6740*c0909341SAndroid Build Coastguard Worker vinserti128 m14, [base+resize_shuf+8+r8*2], 1 6741*c0909341SAndroid Build Coastguard Worker vinserti128 m15, [base+resize_shuf+8+r9*2], 1 6742*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [base+resize_shuf+8+r10*2], 1 6743*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [base+resize_shuf+8+r11*2], 1 6744*c0909341SAndroid Build Coastguard Worker pshufb m10, m14 6745*c0909341SAndroid Build Coastguard Worker pshufb m11, m15 6746*c0909341SAndroid Build Coastguard Worker pshufb m12, m0 6747*c0909341SAndroid Build Coastguard Worker pshufb m13, m2 6748*c0909341SAndroid Build Coastguard Worker.filter: 6749*c0909341SAndroid Build Coastguard Worker movd r8d, xm9 6750*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm9, 1 6751*c0909341SAndroid Build Coastguard Worker pextrd r10d, xm9, 2 6752*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm9, 3 6753*c0909341SAndroid Build Coastguard Worker vextracti128 xm9, m9, 1 6754*c0909341SAndroid Build Coastguard Worker movq xm14, [base+resize_filter+r8*8] 6755*c0909341SAndroid Build Coastguard Worker movq xm15, [base+resize_filter+r9*8] 6756*c0909341SAndroid Build Coastguard Worker movq xm0, [base+resize_filter+r10*8] 6757*c0909341SAndroid Build Coastguard Worker movq xm2, [base+resize_filter+r11*8] 6758*c0909341SAndroid Build Coastguard Worker movd r8d, xm9 6759*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm9, 1 6760*c0909341SAndroid Build Coastguard Worker pextrd r10d, xm9, 2 6761*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm9, 3 6762*c0909341SAndroid 
Build Coastguard Worker movhps xm14, [base+resize_filter+r8*8] 6763*c0909341SAndroid Build Coastguard Worker movhps xm15, [base+resize_filter+r9*8] 6764*c0909341SAndroid Build Coastguard Worker movhps xm0, [base+resize_filter+r10*8] 6765*c0909341SAndroid Build Coastguard Worker movhps xm2, [base+resize_filter+r11*8] 6766*c0909341SAndroid Build Coastguard Worker pmovsxbw m14, xm14 6767*c0909341SAndroid Build Coastguard Worker pmovsxbw m15, xm15 6768*c0909341SAndroid Build Coastguard Worker pmovsxbw m0, xm0 6769*c0909341SAndroid Build Coastguard Worker pmovsxbw m2, xm2 6770*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m14 6771*c0909341SAndroid Build Coastguard Worker pmaddwd m11, m15 6772*c0909341SAndroid Build Coastguard Worker pmaddwd m12, m0 6773*c0909341SAndroid Build Coastguard Worker pmaddwd m13, m2 6774*c0909341SAndroid Build Coastguard Worker phaddd m10, m11 6775*c0909341SAndroid Build Coastguard Worker phaddd m12, m13 6776*c0909341SAndroid Build Coastguard Worker phaddd m10, m12 6777*c0909341SAndroid Build Coastguard Worker psubd m10, m3, m10 6778*c0909341SAndroid Build Coastguard Worker psrad m10, 7 6779*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m10, 1 6780*c0909341SAndroid Build Coastguard Worker packusdw xm10, xm0 6781*c0909341SAndroid Build Coastguard Worker pminsw xm10, xm7 6782*c0909341SAndroid Build Coastguard Worker mova [dstq+xq*2], xm10 6783*c0909341SAndroid Build Coastguard Worker paddd m4, m5 6784*c0909341SAndroid Build Coastguard Worker add xd, 8 6785*c0909341SAndroid Build Coastguard Worker cmp xd, dst_wd 6786*c0909341SAndroid Build Coastguard Worker jl .loop_x 6787*c0909341SAndroid Build Coastguard Worker add dstq, dst_strideq 6788*c0909341SAndroid Build Coastguard Worker add srcq, src_strideq 6789*c0909341SAndroid Build Coastguard Worker dec hd 6790*c0909341SAndroid Build Coastguard Worker jg .loop_y 6791*c0909341SAndroid Build Coastguard Worker RET 6792*c0909341SAndroid Build Coastguard Worker 6793*c0909341SAndroid 
Build Coastguard Worker%endif ; ARCH_X86_64 6794