; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; NOTE(review): SECTION_RODATA comes from x86inc.asm; the argument is presumably
; the section alignment in bytes. The order of the tables below must be kept:
; several of them are loaded with aligned 16-byte moves (e.g. bilin_h_shuf4).
SECTION_RODATA 32

; dav1d_obmc_masks[] with 64-x interleaved
; Every entry is a byte pair (m, 64-m). The "; N" markers give the number of
; pairs in the group that follows; the first four bytes are zero padding.
obmc_masks:     db  0,  0,  0,  0
                ; 2
                db 45, 19, 64,  0
                ; 4
                db 39, 25, 50, 14, 59,  5, 64,  0
                ; 8
                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
                ; 16
                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
                ; 32
                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0

; Byte-index tables, 16 bytes per row (used as pshufb control masks by the
; bilinear code in this file; other users are outside this excerpt).
warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
pb_8x0_8x8:     db  0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8,  8,  8,  8,  8
bdct_lb_dw:     db  0,  0,  0,  0,  4,  4,  4,  4,  8,  8,  8,  8, 12, 12, 12, 12
wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7

wm_420_sign:    dd 0x01020102, 0x01010101
wm_422_sign:    dd 0x80808080, 0x7f7f7f7f

; Small constants, each 4 bytes wide so they can be splatted with a dword
; broadcast (e.g. "vpbroadcastd m3, [pw_2048]" in put_bilin).
pb_64:          times 4 db 64
pw_m256:        times 2 dw -256
pw_15:          times 2 dw 15
pw_32:          times 2 dw 32
pw_34:          times 2 dw 34
pw_258:         times 2 dw 258
pw_512:         times 2 dw 512
pw_1024:        times 2 dw 1024
pw_2048:        times 2 dw 2048
pw_6903:        times 2 dw 6903
pw_8192:        times 2 dw 8192
pd_32:          dd 32
pd_63:          dd 63
pd_512:         dd 512
pd_32768:       dd 32768
pd_0x3ff:       dd 0x3ff
pd_0x4000:      dd 0x4000
pq_0x40000000:  dq 0x40000000

cextern mc_subpel_filters
cextern mc_warp_filter2
cextern resize_filter
cextern z_filter_s

; Address of the external subpel filter table, biased by -8 bytes.
; NOTE(review): presumably compensates for a 1-based index into 8-byte filter
; rows -- confirm against the 8-tap code that uses it.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

; BASE_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits one 16-bit entry per width: the offset of label <fn>_<isa>_w<N> from
; <fn>_<isa> (both resolved through the put_avx2/prep_avx2 style %xdefines).
; Also defines <fn>_<isa>_table as the table address minus w0 (the first
; width), so that "table + 2*tzcnt(w)" lands on the first entry for the
; smallest width (this is how put_bilin indexes it).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; HV_JMP_TABLE fn, filter, isa, types, w0, w1, ...
;
; Emits up to three tables of 16-bit offsets, selected by the "types" bitmask:
;   bit 0 (1) -> <fn>_<filter>_h_<isa>_table   entries point at .h_w<N>
;   bit 1 (2) -> <fn>_<filter>_v_<isa>_table   entries point at .v_w<N>
;   bit 2 (4) -> <fn>_<filter>_hv_<isa>_table  entries point at .hv_w<N>
; Targets are local labels of the mangled function <fn>_<filter>_8bpc_<isa>;
; offsets are taken relative to <fn>_<isa>. Each table symbol is biased by w0
; in the same way as in BASE_JMP_TABLE.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        ; the loop rotated %0-4 times; 4 more brings the arguments back to
        ; their original positions for the next table
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4                       ; restore argument order (see above)
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits one 32-bit entry per width: the offset of local label .w<N> of the
; mangled function <fn>_8bpc_<isa>, relative to the biased table symbol
; <fn>_<isa>_table itself (= table address - 2*w0).
; NOTE(review): the 2*w0 bias matches a "table + 4*tzcnt(w)" lookup; the
; users are outside this excerpt -- confirm.
; At least one width is required because the body reads %3, hence "3-*".
%macro BIDIR_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; SCALED_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits three consecutive tables of 16-bit offsets, one entry per width, all
; relative to the mangled function <fn>_8bpc_<isa>:
;   <fn>_<isa>_table      -> .w<N>
;   <fn>_<isa>_dy1_table  -> .dy1_w<N>
;   <fn>_<isa>_dy2_table  -> .dy2_w<N>
; Each table symbol is biased by w0 (the first width).
; NOTE(review): judging by the %%dy_1024 / %%dy_2048 label names, dy1/dy2 are
; presumably fast paths for a vertical step of 1024 / 2048 -- confirm in the
; scaled code.
; At least one width is required because the body reads %3, hence "3-*".
%macro SCALED_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    ; the loop rotated %0-2 times; 2 more restores the original argument order
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2                           ; restore argument order again
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Base labels that the put/prep jump-table offsets are measured from.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)

; table_offset(type, fn): distance from the base label <type>SUFFIX to the
; jump table <type><fn>SUFFIX_table, to be added to a register that already
; holds the base label (put_bilin does "lea r7, [put_avx2]" first).
; NOTE(review): SUFFIX is not defined in this excerpt; presumably x86inc sets
; it to "_avx2" via INIT_XMM/INIT_YMM avx2 -- confirm.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

; Jump table instantiations; the numeric lists are the supported block widths
; (for HV_JMP_TABLE the first number is the h/v/hv bitmask).
BASE_JMP_TABLE   put,  avx2,            2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE   prep, avx2,               4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  bilin, avx2,  7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, bilin, avx2,  7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  6tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  8tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, 6tap,  avx2,  1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, 8tap,  avx2,  1,    4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled,  avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  avg,        avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_avg,      avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  mask,       avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_420, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_422, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_444, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  blend,      avx2,    4, 8, 16, 32
BIDIR_JMP_TABLE  blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE  blend_h,    avx2, 2, 4, 8, 16, 32, 32, 32

SECTION .text
204*c0909341SAndroid Build Coastguard Worker 205*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2 206*c0909341SAndroid Build Coastguard Workercglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy 207*c0909341SAndroid Build Coastguard Worker movifnidn mxyd, r6m ; mx 208*c0909341SAndroid Build Coastguard Worker lea r7, [put_avx2] 209*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 210*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 211*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 212*c0909341SAndroid Build Coastguard Worker jnz .h 213*c0909341SAndroid Build Coastguard Worker mov mxyd, r7m ; my 214*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 215*c0909341SAndroid Build Coastguard Worker jnz .v 216*c0909341SAndroid Build Coastguard Worker.put: 217*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put,)] 218*c0909341SAndroid Build Coastguard Worker add wq, r7 219*c0909341SAndroid Build Coastguard Worker jmp wq 220*c0909341SAndroid Build Coastguard Worker.put_w2: 221*c0909341SAndroid Build Coastguard Worker movzx r6d, word [srcq+ssq*0] 222*c0909341SAndroid Build Coastguard Worker movzx r7d, word [srcq+ssq*1] 223*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 224*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6w 225*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7w 226*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 227*c0909341SAndroid Build Coastguard Worker sub hd, 2 228*c0909341SAndroid Build Coastguard Worker jg .put_w2 229*c0909341SAndroid Build Coastguard Worker RET 230*c0909341SAndroid Build Coastguard Worker.put_w4: 231*c0909341SAndroid Build Coastguard Worker mov r6d, [srcq+ssq*0] 232*c0909341SAndroid Build Coastguard Worker mov r7d, [srcq+ssq*1] 233*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 234*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6d 235*c0909341SAndroid Build Coastguard Worker 
mov [dstq+dsq*1], r7d 236*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 237*c0909341SAndroid Build Coastguard Worker sub hd, 2 238*c0909341SAndroid Build Coastguard Worker jg .put_w4 239*c0909341SAndroid Build Coastguard Worker RET 240*c0909341SAndroid Build Coastguard Worker.put_w8: 241*c0909341SAndroid Build Coastguard Worker mov r6, [srcq+ssq*0] 242*c0909341SAndroid Build Coastguard Worker mov r7, [srcq+ssq*1] 243*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 244*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6 245*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7 246*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 247*c0909341SAndroid Build Coastguard Worker sub hd, 2 248*c0909341SAndroid Build Coastguard Worker jg .put_w8 249*c0909341SAndroid Build Coastguard Worker RET 250*c0909341SAndroid Build Coastguard Worker.put_w16: 251*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 252*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*1] 253*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 254*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 255*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 256*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 257*c0909341SAndroid Build Coastguard Worker sub hd, 2 258*c0909341SAndroid Build Coastguard Worker jg .put_w16 259*c0909341SAndroid Build Coastguard Worker RET 260*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2 261*c0909341SAndroid Build Coastguard Worker.put_w32: 262*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 263*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*1] 264*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 265*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 266*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 267*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 
268*c0909341SAndroid Build Coastguard Worker sub hd, 2 269*c0909341SAndroid Build Coastguard Worker jg .put_w32 270*c0909341SAndroid Build Coastguard Worker RET 271*c0909341SAndroid Build Coastguard Worker.put_w64: 272*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0+32*0] 273*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*0+32*1] 274*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+ssq*1+32*0] 275*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+ssq*1+32*1] 276*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 277*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+32*0], m0 278*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+32*1], m1 279*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1+32*0], m2 280*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1+32*1], m3 281*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 282*c0909341SAndroid Build Coastguard Worker sub hd, 2 283*c0909341SAndroid Build Coastguard Worker jg .put_w64 284*c0909341SAndroid Build Coastguard Worker RET 285*c0909341SAndroid Build Coastguard Worker.put_w128: 286*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+32*0] 287*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+32*1] 288*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+32*2] 289*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+32*3] 290*c0909341SAndroid Build Coastguard Worker add srcq, ssq 291*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 292*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 293*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m2 294*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m3 295*c0909341SAndroid Build Coastguard Worker add dstq, dsq 296*c0909341SAndroid Build Coastguard Worker dec hd 297*c0909341SAndroid Build Coastguard Worker jg .put_w128 298*c0909341SAndroid Build Coastguard Worker RET 299*c0909341SAndroid Build Coastguard Worker.h: 
300*c0909341SAndroid Build Coastguard Worker ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 301*c0909341SAndroid Build Coastguard Worker ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 302*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 303*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [z_filter_s+2] 304*c0909341SAndroid Build Coastguard Worker add mxyd, 16 305*c0909341SAndroid Build Coastguard Worker movd xm5, mxyd 306*c0909341SAndroid Build Coastguard Worker mov mxyd, r7m ; my 307*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, xm5 308*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 309*c0909341SAndroid Build Coastguard Worker jnz .hv 310*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] 311*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [pw_2048] 312*c0909341SAndroid Build Coastguard Worker add wq, r7 313*c0909341SAndroid Build Coastguard Worker jmp wq 314*c0909341SAndroid Build Coastguard Worker.h_w2: 315*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+ssq*0] 316*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [srcq+ssq*1], 1 317*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 318*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4 319*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm5 320*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm3 321*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 322*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm0, 0 323*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm0, 2 324*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 325*c0909341SAndroid Build Coastguard Worker sub hd, 2 326*c0909341SAndroid Build Coastguard Worker jg .h_w2 327*c0909341SAndroid Build Coastguard Worker RET 328*c0909341SAndroid Build Coastguard Worker.h_w4: 329*c0909341SAndroid Build Coastguard Worker mova xm4, [bilin_h_shuf4] 
330*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 331*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 332*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+ssq*1] 333*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 334*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4 335*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm5 336*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm3 337*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 338*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm0 339*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm0, 1 340*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 341*c0909341SAndroid Build Coastguard Worker sub hd, 2 342*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 343*c0909341SAndroid Build Coastguard Worker RET 344*c0909341SAndroid Build Coastguard Worker.h_w8: 345*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 346*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 347*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 348*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4 349*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm4 350*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm5 351*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm5 352*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm3 353*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm3 354*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 355*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 356*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm0 357*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 358*c0909341SAndroid Build Coastguard Worker sub hd, 2 359*c0909341SAndroid Build Coastguard Worker jg .h_w8 360*c0909341SAndroid Build Coastguard Worker RET 361*c0909341SAndroid Build Coastguard Worker.h_w16: 
362*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+8*0] 363*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+8*0], 1 364*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0+8*1] 365*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1+8*1], 1 366*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 367*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 368*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 369*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 370*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 371*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3 372*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 373*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 374*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 375*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 376*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 377*c0909341SAndroid Build Coastguard Worker sub hd, 2 378*c0909341SAndroid Build Coastguard Worker jg .h_w16 379*c0909341SAndroid Build Coastguard Worker RET 380*c0909341SAndroid Build Coastguard Worker.h_w32: 381*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+8*0] 382*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*1] 383*c0909341SAndroid Build Coastguard Worker add srcq, ssq 384*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 385*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 386*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 387*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 388*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3 389*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 390*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 391*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 392*c0909341SAndroid Build Coastguard Worker add dstq, dsq 393*c0909341SAndroid Build Coastguard 
Worker dec hd 394*c0909341SAndroid Build Coastguard Worker jg .h_w32 395*c0909341SAndroid Build Coastguard Worker RET 396*c0909341SAndroid Build Coastguard Worker.h_w64: 397*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+8*0] 398*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*1] 399*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 400*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 401*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 402*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 403*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3 404*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 405*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 406*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*4] 407*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+8*5] 408*c0909341SAndroid Build Coastguard Worker add srcq, ssq 409*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 410*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 411*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 412*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 413*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 414*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 415*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 416*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 417*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m1 418*c0909341SAndroid Build Coastguard Worker add dstq, dsq 419*c0909341SAndroid Build Coastguard Worker dec hd 420*c0909341SAndroid Build Coastguard Worker jg .h_w64 421*c0909341SAndroid Build Coastguard Worker RET 422*c0909341SAndroid Build Coastguard Worker.h_w128: 423*c0909341SAndroid Build Coastguard Worker mov r6, -32*3 424*c0909341SAndroid Build Coastguard Worker.h_w128_loop: 425*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6+32*3+8*0] 426*c0909341SAndroid Build Coastguard Worker movu m1, 
[srcq+r6+32*3+8*1] 427*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 428*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 429*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 430*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 431*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3 432*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 433*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 434*c0909341SAndroid Build Coastguard Worker mova [dstq+r6+32*3], m0 435*c0909341SAndroid Build Coastguard Worker add r6, 32 436*c0909341SAndroid Build Coastguard Worker jle .h_w128_loop 437*c0909341SAndroid Build Coastguard Worker add srcq, ssq 438*c0909341SAndroid Build Coastguard Worker add dstq, dsq 439*c0909341SAndroid Build Coastguard Worker dec hd 440*c0909341SAndroid Build Coastguard Worker jg .h_w128 441*c0909341SAndroid Build Coastguard Worker RET 442*c0909341SAndroid Build Coastguard Worker.v: 443*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] 444*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 445*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_2048] 446*c0909341SAndroid Build Coastguard Worker add mxyd, 16 447*c0909341SAndroid Build Coastguard Worker add wq, r7 448*c0909341SAndroid Build Coastguard Worker movd xm4, mxyd 449*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, xm4 450*c0909341SAndroid Build Coastguard Worker jmp wq 451*c0909341SAndroid Build Coastguard Worker.v_w2: 452*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+ssq*0] 453*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 454*c0909341SAndroid Build Coastguard Worker pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 455*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 456*c0909341SAndroid Build Coastguard Worker pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 457*c0909341SAndroid Build Coastguard Worker pshuflw xm1, xm1, q2301 ; 1 0 458*c0909341SAndroid Build Coastguard 
Worker punpcklbw xm1, xm0 459*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm4 460*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm5 461*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm1 462*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm1, 1 463*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm1, 0 464*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 465*c0909341SAndroid Build Coastguard Worker sub hd, 2 466*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 467*c0909341SAndroid Build Coastguard Worker RET 468*c0909341SAndroid Build Coastguard Worker.v_w4: 469*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+ssq*0] 470*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 471*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [srcq+ssq*1] 472*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 473*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm2, xm0, 0x01 ; 0 1 474*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 475*c0909341SAndroid Build Coastguard Worker vpblendd xm2, xm0, 0x02 ; 1 2 476*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm2 477*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm4 478*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm5 479*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm1 480*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm1 481*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm1, 1 482*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 483*c0909341SAndroid Build Coastguard Worker sub hd, 2 484*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 485*c0909341SAndroid Build Coastguard Worker RET 486*c0909341SAndroid Build Coastguard Worker.v_w8: 487*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 488*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 489*c0909341SAndroid Build Coastguard Worker movq xm2, 
[srcq+ssq*1] 490*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 491*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm0, xm2 492*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 493*c0909341SAndroid Build Coastguard Worker punpcklbw xm2, xm0 494*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm4 495*c0909341SAndroid Build Coastguard Worker pmaddubsw xm2, xm4 496*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm5 497*c0909341SAndroid Build Coastguard Worker pmulhrsw xm2, xm5 498*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm2 499*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm1 500*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm1 501*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 502*c0909341SAndroid Build Coastguard Worker sub hd, 2 503*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 504*c0909341SAndroid Build Coastguard Worker RET 505*c0909341SAndroid Build Coastguard Worker.v_w16: 506*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 507*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 508*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [srcq+ssq*1] 509*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 510*c0909341SAndroid Build Coastguard Worker vpblendd m2, m3, m0, 0x0f ; 0 1 511*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 512*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0xf0 ; 1 2 513*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m2, m3 514*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 515*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m4 516*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 517*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 518*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 519*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 520*c0909341SAndroid Build Coastguard Worker 
mova [dstq+dsq*0], xm1 521*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m1, 1 522*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 523*c0909341SAndroid Build Coastguard Worker sub hd, 2 524*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 525*c0909341SAndroid Build Coastguard Worker RET 526*c0909341SAndroid Build Coastguard Worker.v_w32: 527*c0909341SAndroid Build Coastguard Worker%macro PUT_BILIN_V_W32 0 528*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 529*c0909341SAndroid Build Coastguard Worker%%loop: 530*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+ssq*1] 531*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 532*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m0, m3 533*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m3 534*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 535*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m4 536*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 537*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 538*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 539*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 540*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m0 541*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0 542*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 543*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m4 544*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 545*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m5 546*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 547*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m1 548*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m2 549*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 550*c0909341SAndroid Build Coastguard Worker sub hd, 2 551*c0909341SAndroid Build Coastguard Worker jg %%loop 552*c0909341SAndroid Build Coastguard 
Worker%endmacro 553*c0909341SAndroid Build Coastguard Worker PUT_BILIN_V_W32 554*c0909341SAndroid Build Coastguard Worker RET 555*c0909341SAndroid Build Coastguard Worker.v_w64: 556*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+32*0] 557*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+32*1] 558*c0909341SAndroid Build Coastguard Worker.v_w64_loop: 559*c0909341SAndroid Build Coastguard Worker add srcq, ssq 560*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+32*0] 561*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m0, m3 562*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m3 563*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 564*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m4 565*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 566*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 567*c0909341SAndroid Build Coastguard Worker packuswb m2, m0 568*c0909341SAndroid Build Coastguard Worker mova m0, m3 569*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+32*1] 570*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m2 571*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1, m3 572*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m3 573*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 574*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m4 575*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 576*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 577*c0909341SAndroid Build Coastguard Worker packuswb m2, m1 578*c0909341SAndroid Build Coastguard Worker mova m1, m3 579*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m2 580*c0909341SAndroid Build Coastguard Worker add dstq, dsq 581*c0909341SAndroid Build Coastguard Worker dec hd 582*c0909341SAndroid Build Coastguard Worker jg .v_w64_loop 583*c0909341SAndroid Build Coastguard Worker RET 584*c0909341SAndroid Build Coastguard Worker.v_w128: 585*c0909341SAndroid Build Coastguard Worker lea 
r6d, [hq+(3<<8)] 586*c0909341SAndroid Build Coastguard Worker mov r4, srcq 587*c0909341SAndroid Build Coastguard Worker mov r7, dstq 588*c0909341SAndroid Build Coastguard Worker.v_w128_loop: 589*c0909341SAndroid Build Coastguard Worker PUT_BILIN_V_W32 590*c0909341SAndroid Build Coastguard Worker add r4, 32 591*c0909341SAndroid Build Coastguard Worker add r7, 32 592*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 593*c0909341SAndroid Build Coastguard Worker mov srcq, r4 594*c0909341SAndroid Build Coastguard Worker mov dstq, r7 595*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 596*c0909341SAndroid Build Coastguard Worker jg .v_w128_loop 597*c0909341SAndroid Build Coastguard Worker RET 598*c0909341SAndroid Build Coastguard Worker.hv: 599*c0909341SAndroid Build Coastguard Worker ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 600*c0909341SAndroid Build Coastguard Worker ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 601*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] 602*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 603*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 ; can't shift by 12 due to signed overflow 604*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_15] 605*c0909341SAndroid Build Coastguard Worker movd xm6, mxyd 606*c0909341SAndroid Build Coastguard Worker add wq, r7 607*c0909341SAndroid Build Coastguard Worker paddb m5, m5 608*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 609*c0909341SAndroid Build Coastguard Worker jmp wq 610*c0909341SAndroid Build Coastguard Worker.hv_w2: 611*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 612*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4 613*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm5 614*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 615*c0909341SAndroid Build Coastguard Worker movd xm1, [srcq+ssq*1] 
616*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 617*c0909341SAndroid Build Coastguard Worker pinsrd xm1, [srcq+ssq*0], 1 618*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm4 619*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm5 ; 1 _ 2 _ 620*c0909341SAndroid Build Coastguard Worker shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ 621*c0909341SAndroid Build Coastguard Worker mova xm0, xm1 622*c0909341SAndroid Build Coastguard Worker psubw xm1, xm2 623*c0909341SAndroid Build Coastguard Worker pmulhw xm1, xm6 624*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm7 625*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 626*c0909341SAndroid Build Coastguard Worker psrlw xm1, 4 627*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm1 628*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm1, 0 629*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm1, 2 630*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 631*c0909341SAndroid Build Coastguard Worker sub hd, 2 632*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 633*c0909341SAndroid Build Coastguard Worker RET 634*c0909341SAndroid Build Coastguard Worker.hv_w4: 635*c0909341SAndroid Build Coastguard Worker mova xm4, [bilin_h_shuf4] 636*c0909341SAndroid Build Coastguard Worker movddup xm0, [srcq+ssq*0] 637*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4 638*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm5 639*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 640*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*1] 641*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 642*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+ssq*0] 643*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm4 644*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm5 ; 1 2 645*c0909341SAndroid Build Coastguard Worker shufps xm2, xm0, xm1, q1032 ; 0 1 646*c0909341SAndroid Build Coastguard Worker mova 
xm0, xm1 647*c0909341SAndroid Build Coastguard Worker psubw xm1, xm2 648*c0909341SAndroid Build Coastguard Worker pmulhw xm1, xm6 649*c0909341SAndroid Build Coastguard Worker pavgw xm2, xm7 650*c0909341SAndroid Build Coastguard Worker paddw xm1, xm2 651*c0909341SAndroid Build Coastguard Worker psrlw xm1, 4 652*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm1 653*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm1 654*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm1, 1 655*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 656*c0909341SAndroid Build Coastguard Worker sub hd, 2 657*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 658*c0909341SAndroid Build Coastguard Worker RET 659*c0909341SAndroid Build Coastguard Worker.hv_w8: 660*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 661*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 662*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 663*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 664*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 665*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 666*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*0], 1 667*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 668*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 ; 1 2 669*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m0, m1, 0x21 ; 0 1 670*c0909341SAndroid Build Coastguard Worker mova m0, m1 671*c0909341SAndroid Build Coastguard Worker psubw m1, m2 672*c0909341SAndroid Build Coastguard Worker pmulhw m1, m6 673*c0909341SAndroid Build Coastguard Worker pavgw m2, m7 674*c0909341SAndroid Build Coastguard Worker paddw m1, m2 675*c0909341SAndroid Build Coastguard Worker psrlw m1, 4 676*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m1, 1 677*c0909341SAndroid Build Coastguard Worker packuswb xm1, xm2 678*c0909341SAndroid Build Coastguard Worker movq 
[dstq+dsq*0], xm1 679*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm1 680*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 681*c0909341SAndroid Build Coastguard Worker sub hd, 2 682*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 683*c0909341SAndroid Build Coastguard Worker RET 684*c0909341SAndroid Build Coastguard Worker.hv_w16: 685*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0+8*0] 686*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*0+8*1], 1 687*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 688*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 689*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 690*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*1+8*0] 691*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*1+8*1], 1 692*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 693*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*0+8*0] 694*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*0+8*1], 1 695*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 696*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 697*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 698*c0909341SAndroid Build Coastguard Worker psubw m1, m2, m0 699*c0909341SAndroid Build Coastguard Worker pmulhw m1, m6 700*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 701*c0909341SAndroid Build Coastguard Worker paddw m1, m0 702*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3, m5 703*c0909341SAndroid Build Coastguard Worker psubw m3, m0, m2 704*c0909341SAndroid Build Coastguard Worker pmulhw m3, m6 705*c0909341SAndroid Build Coastguard Worker pavgw m2, m7 706*c0909341SAndroid Build Coastguard Worker paddw m3, m2 707*c0909341SAndroid Build Coastguard Worker psrlw m1, 4 708*c0909341SAndroid Build Coastguard Worker psrlw m3, 4 709*c0909341SAndroid Build Coastguard Worker packuswb m1, m3 710*c0909341SAndroid Build 
Coastguard Worker vpermq m1, m1, q3120 711*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm1 712*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m1, 1 713*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 714*c0909341SAndroid Build Coastguard Worker sub hd, 2 715*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 716*c0909341SAndroid Build Coastguard Worker RET 717*c0909341SAndroid Build Coastguard Worker.hv_w128: 718*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+(3<<16)] 719*c0909341SAndroid Build Coastguard Worker jmp .hv_w32_start 720*c0909341SAndroid Build Coastguard Worker.hv_w64: 721*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+(1<<16)] 722*c0909341SAndroid Build Coastguard Worker.hv_w32_start: 723*c0909341SAndroid Build Coastguard Worker mov r4, srcq 724*c0909341SAndroid Build Coastguard Worker mov r7, dstq 725*c0909341SAndroid Build Coastguard Worker.hv_w32: 726*c0909341SAndroid Build Coastguard Worker%if WIN64 727*c0909341SAndroid Build Coastguard Worker movaps r4m, xmm8 728*c0909341SAndroid Build Coastguard Worker%endif 729*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0: 730*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+8*0] 731*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*1] 732*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 733*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 734*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 735*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 736*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 737*c0909341SAndroid Build Coastguard Worker add srcq, ssq 738*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+8*0] 739*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+8*1] 740*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 741*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 742*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 743*c0909341SAndroid 
Build Coastguard Worker pmaddubsw m3, m5 744*c0909341SAndroid Build Coastguard Worker psubw m8, m2, m0 745*c0909341SAndroid Build Coastguard Worker pmulhw m8, m6 746*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 747*c0909341SAndroid Build Coastguard Worker paddw m8, m0 748*c0909341SAndroid Build Coastguard Worker mova m0, m2 749*c0909341SAndroid Build Coastguard Worker psubw m2, m3, m1 750*c0909341SAndroid Build Coastguard Worker pmulhw m2, m6 751*c0909341SAndroid Build Coastguard Worker pavgw m1, m7 752*c0909341SAndroid Build Coastguard Worker paddw m2, m1 753*c0909341SAndroid Build Coastguard Worker mova m1, m3 754*c0909341SAndroid Build Coastguard Worker psrlw m8, 4 755*c0909341SAndroid Build Coastguard Worker psrlw m2, 4 756*c0909341SAndroid Build Coastguard Worker packuswb m8, m2 757*c0909341SAndroid Build Coastguard Worker mova [dstq], m8 758*c0909341SAndroid Build Coastguard Worker add dstq, dsq 759*c0909341SAndroid Build Coastguard Worker dec hd 760*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 761*c0909341SAndroid Build Coastguard Worker add r4, 32 762*c0909341SAndroid Build Coastguard Worker add r7, 32 763*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 764*c0909341SAndroid Build Coastguard Worker mov srcq, r4 765*c0909341SAndroid Build Coastguard Worker mov dstq, r7 766*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<16 767*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop0 768*c0909341SAndroid Build Coastguard Worker%if WIN64 769*c0909341SAndroid Build Coastguard Worker movaps xmm8, r4m 770*c0909341SAndroid Build Coastguard Worker%endif 771*c0909341SAndroid Build Coastguard Worker RET 772*c0909341SAndroid Build Coastguard Worker 773*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 774*c0909341SAndroid Build Coastguard Worker movifnidn mxyd, r5m ; mx 775*c0909341SAndroid Build Coastguard Worker lea r6, [prep%+SUFFIX] 776*c0909341SAndroid Build Coastguard 
Worker tzcnt wd, wm 777*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 778*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 779*c0909341SAndroid Build Coastguard Worker jnz .h 780*c0909341SAndroid Build Coastguard Worker mov mxyd, r6m ; my 781*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 782*c0909341SAndroid Build Coastguard Worker jnz .v 783*c0909341SAndroid Build Coastguard Worker.prep: 784*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep,)] 785*c0909341SAndroid Build Coastguard Worker add wq, r6 786*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 787*c0909341SAndroid Build Coastguard Worker jmp wq 788*c0909341SAndroid Build Coastguard Worker.prep_w4: 789*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+strideq*0] 790*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [srcq+strideq*1], 1 791*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [srcq+strideq*2], 2 792*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [srcq+stride3q ], 3 793*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 794*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, xm0 795*c0909341SAndroid Build Coastguard Worker psllw m0, 4 796*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 797*c0909341SAndroid Build Coastguard Worker add tmpq, 32 798*c0909341SAndroid Build Coastguard Worker sub hd, 4 799*c0909341SAndroid Build Coastguard Worker jg .prep_w4 800*c0909341SAndroid Build Coastguard Worker RET 801*c0909341SAndroid Build Coastguard Worker.prep_w8: 802*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+strideq*0] 803*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+strideq*1] 804*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*2] 805*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+stride3q ] 806*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 807*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, xm0 
808*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, xm1 809*c0909341SAndroid Build Coastguard Worker psllw m0, 4 810*c0909341SAndroid Build Coastguard Worker psllw m1, 4 811*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 812*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 813*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 814*c0909341SAndroid Build Coastguard Worker sub hd, 4 815*c0909341SAndroid Build Coastguard Worker jg .prep_w8 816*c0909341SAndroid Build Coastguard Worker RET 817*c0909341SAndroid Build Coastguard Worker.prep_w16: 818*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+strideq*0] 819*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+strideq*1] 820*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+strideq*2] 821*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+stride3q ] 822*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 823*c0909341SAndroid Build Coastguard Worker psllw m0, 4 824*c0909341SAndroid Build Coastguard Worker psllw m1, 4 825*c0909341SAndroid Build Coastguard Worker psllw m2, 4 826*c0909341SAndroid Build Coastguard Worker psllw m3, 4 827*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 828*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 829*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 830*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 831*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 832*c0909341SAndroid Build Coastguard Worker sub hd, 4 833*c0909341SAndroid Build Coastguard Worker jg .prep_w16 834*c0909341SAndroid Build Coastguard Worker RET 835*c0909341SAndroid Build Coastguard Worker.prep_w32: 836*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+strideq*0+16*0] 837*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+strideq*0+16*1] 838*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+strideq*1+16*0] 839*c0909341SAndroid Build 
Coastguard Worker pmovzxbw m3, [srcq+strideq*1+16*1] 840*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 841*c0909341SAndroid Build Coastguard Worker psllw m0, 4 842*c0909341SAndroid Build Coastguard Worker psllw m1, 4 843*c0909341SAndroid Build Coastguard Worker psllw m2, 4 844*c0909341SAndroid Build Coastguard Worker psllw m3, 4 845*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 846*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 847*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 848*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 849*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 850*c0909341SAndroid Build Coastguard Worker sub hd, 2 851*c0909341SAndroid Build Coastguard Worker jg .prep_w32 852*c0909341SAndroid Build Coastguard Worker RET 853*c0909341SAndroid Build Coastguard Worker.prep_w64: 854*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+16*0] 855*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+16*1] 856*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+16*2] 857*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+16*3] 858*c0909341SAndroid Build Coastguard Worker add srcq, strideq 859*c0909341SAndroid Build Coastguard Worker psllw m0, 4 860*c0909341SAndroid Build Coastguard Worker psllw m1, 4 861*c0909341SAndroid Build Coastguard Worker psllw m2, 4 862*c0909341SAndroid Build Coastguard Worker psllw m3, 4 863*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 864*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 865*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 866*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 867*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 868*c0909341SAndroid Build Coastguard Worker dec hd 869*c0909341SAndroid Build Coastguard Worker jg .prep_w64 870*c0909341SAndroid Build Coastguard Worker RET 871*c0909341SAndroid Build Coastguard 
Worker.prep_w128: 872*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+16*0] 873*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+16*1] 874*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+16*2] 875*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+16*3] 876*c0909341SAndroid Build Coastguard Worker psllw m0, 4 877*c0909341SAndroid Build Coastguard Worker psllw m1, 4 878*c0909341SAndroid Build Coastguard Worker psllw m2, 4 879*c0909341SAndroid Build Coastguard Worker psllw m3, 4 880*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 881*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 882*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 883*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 884*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+16*4] 885*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+16*5] 886*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+16*6] 887*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+16*7] 888*c0909341SAndroid Build Coastguard Worker add tmpq, 32*8 889*c0909341SAndroid Build Coastguard Worker add srcq, strideq 890*c0909341SAndroid Build Coastguard Worker psllw m0, 4 891*c0909341SAndroid Build Coastguard Worker psllw m1, 4 892*c0909341SAndroid Build Coastguard Worker psllw m2, 4 893*c0909341SAndroid Build Coastguard Worker psllw m3, 4 894*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*4], m0 895*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*3], m1 896*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*2], m2 897*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*1], m3 898*c0909341SAndroid Build Coastguard Worker dec hd 899*c0909341SAndroid Build Coastguard Worker jg .prep_w128 900*c0909341SAndroid Build Coastguard Worker RET 901*c0909341SAndroid Build Coastguard Worker.h: 902*c0909341SAndroid Build Coastguard Worker ; 16 * src[x] + (mx * (src[x + 1] - src[x])) 
903*c0909341SAndroid Build Coastguard Worker ; = (16 - mx) * src[x] + mx * src[x + 1] 904*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 905*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [z_filter_s+2] 906*c0909341SAndroid Build Coastguard Worker add mxyd, 16 907*c0909341SAndroid Build Coastguard Worker movd xm5, mxyd 908*c0909341SAndroid Build Coastguard Worker mov mxyd, r6m ; my 909*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, xm5 910*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 911*c0909341SAndroid Build Coastguard Worker jnz .hv 912*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 913*c0909341SAndroid Build Coastguard Worker add wq, r6 914*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 915*c0909341SAndroid Build Coastguard Worker jmp wq 916*c0909341SAndroid Build Coastguard Worker.h_w4: 917*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [bilin_h_shuf4] 918*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 919*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+strideq*0] 920*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+strideq*1] 921*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*2] 922*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+stride3q ] 923*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 924*c0909341SAndroid Build Coastguard Worker vinserti128 m0, xm1, 1 925*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 926*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 927*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 928*c0909341SAndroid Build Coastguard Worker add tmpq, 32 929*c0909341SAndroid Build Coastguard Worker sub hd, 4 930*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 931*c0909341SAndroid Build Coastguard Worker RET 932*c0909341SAndroid Build Coastguard Worker.h_w8: 933*c0909341SAndroid Build Coastguard Worker.h_w8_loop: 
934*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 935*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*1], 1 936*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*2] 937*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+stride3q ], 1 938*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 939*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 940*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 941*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 942*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 943*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 944*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 945*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 946*c0909341SAndroid Build Coastguard Worker sub hd, 4 947*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 948*c0909341SAndroid Build Coastguard Worker RET 949*c0909341SAndroid Build Coastguard Worker.h_w16: 950*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 951*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+8*0] 952*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0+8*1], 1 953*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1+8*0] 954*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*1+8*1], 1 955*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*2+8*0] 956*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*2+8*1], 1 957*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+stride3q +8*0] 958*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+stride3q +8*1], 1 959*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 960*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 961*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 962*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 963*c0909341SAndroid 
Build Coastguard Worker pshufb m3, m4 964*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 965*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 966*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 967*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 968*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 969*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 970*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 971*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 972*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 973*c0909341SAndroid Build Coastguard Worker sub hd, 4 974*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 975*c0909341SAndroid Build Coastguard Worker RET 976*c0909341SAndroid Build Coastguard Worker.h_w32: 977*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 978*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+8*0] 979*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0+8*1], 1 980*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0+8*2] 981*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*0+8*3], 1 982*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*1+8*0] 983*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*1+8*1], 1 984*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+strideq*1+8*2] 985*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+strideq*1+8*3], 1 986*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 987*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 988*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 989*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 990*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 991*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 992*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 993*c0909341SAndroid Build Coastguard 
Worker pmaddubsw m2, m5 994*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 995*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 996*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 997*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 998*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 999*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 1000*c0909341SAndroid Build Coastguard Worker sub hd, 2 1001*c0909341SAndroid Build Coastguard Worker jg .h_w32_loop 1002*c0909341SAndroid Build Coastguard Worker RET 1003*c0909341SAndroid Build Coastguard Worker.h_w64: 1004*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+8*0] 1005*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+8*1], 1 1006*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+8*2] 1007*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+8*3], 1 1008*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+8*4] 1009*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+8*5], 1 1010*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+8*6] 1011*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+8*7], 1 1012*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1013*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1014*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1015*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1016*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 1017*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1018*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1019*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1020*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1021*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 1022*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 1023*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 1024*c0909341SAndroid Build Coastguard 
Worker mova [tmpq+32*3], m3 1025*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 1026*c0909341SAndroid Build Coastguard Worker dec hd 1027*c0909341SAndroid Build Coastguard Worker jg .h_w64 1028*c0909341SAndroid Build Coastguard Worker RET 1029*c0909341SAndroid Build Coastguard Worker.h_w128: 1030*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+8*0] 1031*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+8*1], 1 1032*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+8*2] 1033*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+8*3], 1 1034*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+8*4] 1035*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+8*5], 1 1036*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+8*6] 1037*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+8*7], 1 1038*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1039*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1040*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1041*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 1042*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1043*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1044*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1045*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1046*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 1047*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m1 1048*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m2 1049*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m3 1050*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+8* 8] 1051*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+8* 9], 1 1052*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+8*10] 1053*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+8*11], 1 1054*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+8*12] 
1055*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+8*13], 1 1056*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+8*14] 1057*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+8*15], 1 1058*c0909341SAndroid Build Coastguard Worker add tmpq, 32*8 1059*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1060*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1061*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1062*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1063*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 1064*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1065*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1066*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1067*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1068*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*4], m0 1069*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*3], m1 1070*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*2], m2 1071*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*1], m3 1072*c0909341SAndroid Build Coastguard Worker dec hd 1073*c0909341SAndroid Build Coastguard Worker jg .h_w128 1074*c0909341SAndroid Build Coastguard Worker RET 1075*c0909341SAndroid Build Coastguard Worker.v: 1076*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 1077*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] 1078*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 1079*c0909341SAndroid Build Coastguard Worker add mxyd, 16 1080*c0909341SAndroid Build Coastguard Worker add wq, r6 1081*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1082*c0909341SAndroid Build Coastguard Worker movd xm6, mxyd 1083*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 1084*c0909341SAndroid Build Coastguard Worker jmp wq 1085*c0909341SAndroid Build Coastguard Worker.v_w4: 1086*c0909341SAndroid Build 
Coastguard Worker movd xm0, [srcq+strideq*0] 1087*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1088*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [srcq+strideq*2] 1089*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [srcq+strideq*1] 1090*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [srcq+stride3q ] 1091*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1092*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x05 ; 0 2 2 2 1093*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [srcq+strideq*0] 1094*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0x0f ; 1 1 3 3 1095*c0909341SAndroid Build Coastguard Worker vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 1096*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0xaa ; 0 1 2 3 1097*c0909341SAndroid Build Coastguard Worker vpblendd m2, m3, 0x55 ; 1 2 3 4 1098*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m2 1099*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1100*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1101*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1102*c0909341SAndroid Build Coastguard Worker sub hd, 4 1103*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1104*c0909341SAndroid Build Coastguard Worker RET 1105*c0909341SAndroid Build Coastguard Worker.v_w8: 1106*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+strideq*0] 1107*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1108*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+strideq*2] 1109*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+strideq*1] 1110*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+stride3q ] 1111*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1112*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x03 ; 0 2 2 2 1113*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*0] 1114*c0909341SAndroid Build Coastguard 
Worker vpblendd m2, m3, 0xcc ; 1 3 1 3 1115*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 1116*c0909341SAndroid Build Coastguard Worker vpblendd m2, m1, 0x0f ; 0 2 1 3 1117*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0xc0 ; 1 3 2 4 1118*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m2, m3 1119*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 1120*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1121*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1122*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m1 1123*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m2 1124*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 1125*c0909341SAndroid Build Coastguard Worker sub hd, 4 1126*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1127*c0909341SAndroid Build Coastguard Worker RET 1128*c0909341SAndroid Build Coastguard Worker.v_w16: 1129*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+strideq*0] 1130*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1131*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [srcq+strideq*1] 1132*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [srcq+strideq*2] 1133*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [srcq+stride3q ] 1134*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1135*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, m2, 0x0c ; 0 2 1136*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+strideq*0] 1137*c0909341SAndroid Build Coastguard Worker shufpd m1, m3, 0x0c ; 1 3 1138*c0909341SAndroid Build Coastguard Worker shufpd m2, m0, 0x0c ; 2 4 1139*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4, m1 1140*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m1, m2 1141*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m1 1142*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1143*c0909341SAndroid Build 
Coastguard Worker pmaddubsw m3, m6 1144*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m6 1145*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1146*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1147*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m3 1148*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m5 1149*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m4 1150*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m1 1151*c0909341SAndroid Build Coastguard Worker add tmpq, 32*4 1152*c0909341SAndroid Build Coastguard Worker sub hd, 4 1153*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1154*c0909341SAndroid Build Coastguard Worker RET 1155*c0909341SAndroid Build Coastguard Worker.v_w32: 1156*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0], q3120 1157*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1158*c0909341SAndroid Build Coastguard Worker vpermq m1, [srcq+strideq*1], q3120 1159*c0909341SAndroid Build Coastguard Worker vpermq m2, [srcq+strideq*2], q3120 1160*c0909341SAndroid Build Coastguard Worker vpermq m3, [srcq+stride3q ], q3120 1161*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1162*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0, m1 1163*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m0, m1 1164*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0], q3120 1165*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1166*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m6 1167*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m4 1168*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m5 1169*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m1, m2 1170*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1171*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1172*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1173*c0909341SAndroid Build Coastguard 
Worker punpcklbw m5, m2, m3 1174*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 1175*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m6 1176*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1177*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m4 1178*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m1 1179*c0909341SAndroid Build Coastguard Worker add tmpq, 32*8 1180*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3, m0 1181*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0 1182*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1183*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1184*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*4], m5 1185*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*3], m2 1186*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*2], m1 1187*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*1], m3 1188*c0909341SAndroid Build Coastguard Worker sub hd, 4 1189*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1190*c0909341SAndroid Build Coastguard Worker RET 1191*c0909341SAndroid Build Coastguard Worker.v_w64: 1192*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0+32*0], q3120 1193*c0909341SAndroid Build Coastguard Worker vpermq m1, [srcq+strideq*0+32*1], q3120 1194*c0909341SAndroid Build Coastguard Worker.v_w64_loop: 1195*c0909341SAndroid Build Coastguard Worker vpermq m2, [srcq+strideq*1+32*0], q3120 1196*c0909341SAndroid Build Coastguard Worker vpermq m3, [srcq+strideq*1+32*1], q3120 1197*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1198*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0, m2 1199*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2 1200*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1201*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m6 1202*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m4 1203*c0909341SAndroid Build Coastguard Worker 
mova [tmpq+32*1], m0 1204*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m1, m3 1205*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m1, m3 1206*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0+32*0], q3120 1207*c0909341SAndroid Build Coastguard Worker vpermq m1, [srcq+strideq*0+32*1], q3120 1208*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1209*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m6 1210*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*2], m4 1211*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*3], m5 1212*c0909341SAndroid Build Coastguard Worker add tmpq, 32*8 1213*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2, m0 1214*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0 1215*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m3, m1 1216*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 1217*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1218*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1219*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m6 1220*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1221*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*4], m4 1222*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*3], m2 1223*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*2], m5 1224*c0909341SAndroid Build Coastguard Worker mova [tmpq-32*1], m3 1225*c0909341SAndroid Build Coastguard Worker sub hd, 2 1226*c0909341SAndroid Build Coastguard Worker jg .v_w64_loop 1227*c0909341SAndroid Build Coastguard Worker RET 1228*c0909341SAndroid Build Coastguard Worker.v_w128: 1229*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+(3<<8)] 1230*c0909341SAndroid Build Coastguard Worker mov r3, srcq 1231*c0909341SAndroid Build Coastguard Worker mov r5, tmpq 1232*c0909341SAndroid Build Coastguard Worker.v_w128_loop0: 1233*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0], q3120 1234*c0909341SAndroid 
Build Coastguard Worker.v_w128_loop: 1235*c0909341SAndroid Build Coastguard Worker vpermq m1, [srcq+strideq*1], q3120 1236*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1237*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m0, m1 1238*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0, m1 1239*c0909341SAndroid Build Coastguard Worker vpermq m0, [srcq+strideq*0], q3120 1240*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1241*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1242*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m1, m0 1243*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0 1244*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1245*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1246*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m2 1247*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m3 1248*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*8], m4 1249*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*9], m1 1250*c0909341SAndroid Build Coastguard Worker add tmpq, 32*16 1251*c0909341SAndroid Build Coastguard Worker sub hd, 2 1252*c0909341SAndroid Build Coastguard Worker jg .v_w128_loop 1253*c0909341SAndroid Build Coastguard Worker add r3, 32 1254*c0909341SAndroid Build Coastguard Worker add r5, 64 1255*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 1256*c0909341SAndroid Build Coastguard Worker mov srcq, r3 1257*c0909341SAndroid Build Coastguard Worker mov tmpq, r5 1258*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 1259*c0909341SAndroid Build Coastguard Worker jg .v_w128_loop0 1260*c0909341SAndroid Build Coastguard Worker RET 1261*c0909341SAndroid Build Coastguard Worker.hv: 1262*c0909341SAndroid Build Coastguard Worker ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 1263*c0909341SAndroid Build Coastguard Worker ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) 1264*c0909341SAndroid Build 
Coastguard Worker WIN64_SPILL_XMM 7 1265*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1266*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 1267*c0909341SAndroid Build Coastguard Worker movd xm6, mxyd 1268*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, xm6 1269*c0909341SAndroid Build Coastguard Worker add wq, r6 1270*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1271*c0909341SAndroid Build Coastguard Worker jmp wq 1272*c0909341SAndroid Build Coastguard Worker.hv_w4: 1273*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [bilin_h_shuf4] 1274*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*0] 1275*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1276*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1277*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1278*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*1] 1279*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+strideq*2] 1280*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+stride3q ] 1281*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1282*c0909341SAndroid Build Coastguard Worker movhps xm2, [srcq+strideq*0] 1283*c0909341SAndroid Build Coastguard Worker vinserti128 m1, xm2, 1 1284*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1285*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 ; 1 2 3 4 1286*c0909341SAndroid Build Coastguard Worker vpblendd m2, m1, m0, 0xc0 1287*c0909341SAndroid Build Coastguard Worker vpermq m2, m2, q2103 ; 0 1 2 3 1288*c0909341SAndroid Build Coastguard Worker mova m0, m1 1289*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1290*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1291*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1292*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1293*c0909341SAndroid Build Coastguard Worker add tmpq, 32 
1294*c0909341SAndroid Build Coastguard Worker sub hd, 4 1295*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1296*c0909341SAndroid Build Coastguard Worker RET 1297*c0909341SAndroid Build Coastguard Worker.hv_w8: 1298*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+strideq*0] 1299*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1300*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1301*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 1302*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1] 1303*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*2], 1 1304*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+stride3q ] 1305*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1306*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*0], 1 1307*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1308*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1309*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 ; 1 2 1310*c0909341SAndroid Build Coastguard Worker vperm2i128 m3, m0, m1, 0x21 ; 0 1 1311*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 ; 3 4 1312*c0909341SAndroid Build Coastguard Worker vperm2i128 m2, m1, m0, 0x21 ; 2 3 1313*c0909341SAndroid Build Coastguard Worker psubw m1, m3 1314*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1315*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1316*c0909341SAndroid Build Coastguard Worker psubw m3, m0, m2 1317*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1318*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1319*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m1 1320*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m3 1321*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 1322*c0909341SAndroid Build Coastguard Worker sub hd, 4 1323*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 1324*c0909341SAndroid Build 
Coastguard Worker RET 1325*c0909341SAndroid Build Coastguard Worker.hv_w16: 1326*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+8*0] 1327*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0+8*1], 1 1328*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1329*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1330*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 1331*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1+8*0] 1332*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*1+8*1], 1 1333*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1334*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*0+8*0] 1335*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*0+8*1], 1 1336*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1337*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1338*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1339*c0909341SAndroid Build Coastguard Worker psubw m3, m1, m0 1340*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1341*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1342*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 1343*c0909341SAndroid Build Coastguard Worker psubw m2, m0, m1 1344*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 1345*c0909341SAndroid Build Coastguard Worker paddw m2, m1 1346*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m3 1347*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m2 1348*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 1349*c0909341SAndroid Build Coastguard Worker sub hd, 2 1350*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 1351*c0909341SAndroid Build Coastguard Worker RET 1352*c0909341SAndroid Build Coastguard Worker.hv_w32: 1353*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+8*0] 1354*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+8*1], 1 
1355*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+8*2] 1356*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+8*3], 1 1357*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1358*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1359*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1360*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1361*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 1362*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1363*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+8*0] 1364*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+8*1], 1 1365*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1366*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1367*c0909341SAndroid Build Coastguard Worker psubw m3, m2, m0 1368*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1369*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1370*c0909341SAndroid Build Coastguard Worker mova m0, m2 1371*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+8*2] 1372*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+8*3], 1 1373*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1374*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1375*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m3 1376*c0909341SAndroid Build Coastguard Worker psubw m3, m2, m1 1377*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1378*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1379*c0909341SAndroid Build Coastguard Worker mova m1, m2 1380*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m3 1381*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 1382*c0909341SAndroid Build Coastguard Worker dec hd 1383*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 1384*c0909341SAndroid Build Coastguard Worker RET 1385*c0909341SAndroid Build Coastguard Worker.hv_w128: 1386*c0909341SAndroid Build Coastguard Worker lea r3d, 
[hq+(7<<8)] 1387*c0909341SAndroid Build Coastguard Worker mov r6d, 256 1388*c0909341SAndroid Build Coastguard Worker jmp .hv_w64_start 1389*c0909341SAndroid Build Coastguard Worker.hv_w64: 1390*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+(3<<8)] 1391*c0909341SAndroid Build Coastguard Worker mov r6d, 128 1392*c0909341SAndroid Build Coastguard Worker.hv_w64_start: 1393*c0909341SAndroid Build Coastguard Worker%if WIN64 1394*c0909341SAndroid Build Coastguard Worker PUSH r7 1395*c0909341SAndroid Build Coastguard Worker%endif 1396*c0909341SAndroid Build Coastguard Worker mov r5, srcq 1397*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 1398*c0909341SAndroid Build Coastguard Worker.hv_w64_loop0: 1399*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+8*0] 1400*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0+8*1], 1 1401*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1402*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1403*c0909341SAndroid Build Coastguard Worker.hv_w64_loop: 1404*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1+8*0] 1405*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*1+8*1], 1 1406*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1407*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*0+8*0] 1408*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+strideq*0+8*1], 1 1409*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1410*c0909341SAndroid Build Coastguard Worker pshufb m2, m4 1411*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1412*c0909341SAndroid Build Coastguard Worker psubw m3, m1, m0 1413*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1414*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1415*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 1416*c0909341SAndroid Build Coastguard Worker psubw m2, m0, m1 1417*c0909341SAndroid Build Coastguard 
; ---------------------------------------------------------------------------
; Tail of the .hv_w64 / .hv_w128 path (inner loop body continues from above).
; Register roles, as set up at .hv_w64 / .hv_w128 / .hv_w64_start:
;   r6  = byte distance between consecutive output rows in tmp (128 or 256)
;   r3d = h in the low byte, remaining-column counter in bits 8 and up
;   r5  = src pointer for the current 16-pixel column
;   r7  = tmp pointer for the current 16-pixel column
; ---------------------------------------------------------------------------
    pmulhrsw        m2, m6                  ; scale the row difference by m6 (mxyd << 11, loaded at .hv)
    paddw           m2, m1                  ; add back the upper row
    mova            [tmpq+r6*0], m3
    mova            [tmpq+r6*1], m2
    lea             tmpq, [tmpq+r6*2]       ; two output rows written per iteration
    sub             hd, 2
    jg .hv_w64_loop
    add             r5, 16                  ; next column: 16 source bytes ...
    add             r7, 32                  ; ... produce 32 bytes (16 words) of tmp
    movzx           hd, r3b                 ; reload h from the low byte of r3
    mov             srcq, r5
    mov             tmpq, r7
    sub             r3d, 1<<8               ; one column done
    jg .hv_w64_loop0
%if WIN64
    POP             r7                      ; matches the PUSH r7 at .hv_w64_start
%endif
    RET

; int8_t subpel_filters[5][15][8]
;
; Each FILTER_* constant packs two offsets into subpel_filters, both expressed
; as (filter set index * 15): one in bits 16 and up, one in the low 16 bits.
; put_6tap_8bpc adds the constant to (mx or my) * 0x010101, which yields the
; "6tap, mx/my, 4tap" byte layout noted at its entry.
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN fn, type, type_h, type_v [, jmp_to]
;
; Emits the entry point <fn>_<type>_8bpc. The entry point loads FILTER_<type_h>
; into t0d and FILTER_<type_v> into t1d, then jumps to the shared
; implementation <jmp_to>. When <jmp_to> is omitted no jump is emitted and
; execution falls through into whatever code follows the instantiation.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov             t0d, FILTER_%3
%ifidn %3, %4
    mov             t1d, t0d                ; same type in both directions: reuse t0d
%else
    mov             t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; Registers backing t0/t1, which carry the filter types from the FN stubs
; into the shared implementation.
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

; Entry points that share put_6tap_8bpc. The last instantiation passes no
; jump target and therefore falls through into put_6tap_8bpc directly below.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

; Shared implementation entry (the function continues below).
; In: t0d / t1d = FILTER_* constants for the horizontal / vertical direction.
; NOTE(review): on non-WIN64, t0/t1 are r7/r8; this function writes myd and
; r8 right after consuming t0d and t1d, so the statement order below looks
; load-bearing -- confirm against x86inc's register naming before reordering.
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
    imul            mxd, mxm, 0x010101      ; replicate mx into the three low bytes
    add             mxd, t0d ; 6tap_h, mx, 4tap_h
    imul            myd, mym, 0x010101      ; replicate my into the three low bytes
    add             myd, t1d ; 6tap_v, my, 4tap_v
    lea             r8, [put_avx2]          ; base for the jump-table / filter lookups
    mov             wd, wm
    movifnidn       hd, hm
    test            mxd, 0xf00              ; byte 1 holds the bare mx
    jnz .h
    test            myd, 0xf00              ; byte 1 holds the bare my
    jnz .v
1478*c0909341SAndroid Build Coastguard Worker.put: 1479*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 1480*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put,)] 1481*c0909341SAndroid Build Coastguard Worker add wq, r8 1482*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 1483*c0909341SAndroid Build Coastguard Worker lea r7, [dsq*3] 1484*c0909341SAndroid Build Coastguard Worker%if WIN64 1485*c0909341SAndroid Build Coastguard Worker pop r8 1486*c0909341SAndroid Build Coastguard Worker%endif 1487*c0909341SAndroid Build Coastguard Worker jmp wq 1488*c0909341SAndroid Build Coastguard Worker.h_w2: 1489*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1490*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq-1] 1491*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2] 1492*c0909341SAndroid Build Coastguard Worker je .h_w4 1493*c0909341SAndroid Build Coastguard Worker mova xm3, [subpel_h_shuf4] 1494*c0909341SAndroid Build Coastguard Worker.h_w2_loop: 1495*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 1496*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+ssq*1] 1497*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1498*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm3 1499*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm4 1500*c0909341SAndroid Build Coastguard Worker phaddw xm0, xm0 1501*c0909341SAndroid Build Coastguard Worker paddw xm0, xm5 1502*c0909341SAndroid Build Coastguard Worker psraw xm0, 6 1503*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 1504*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm0, 0 1505*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm0, 1 1506*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1507*c0909341SAndroid Build Coastguard Worker sub hd, 2 1508*c0909341SAndroid Build Coastguard Worker jg .h_w2_loop 1509*c0909341SAndroid 
Build Coastguard Worker RET 1510*c0909341SAndroid Build Coastguard Worker.h_w4: 1511*c0909341SAndroid Build Coastguard Worker mova xm3, [subpel_h_shufA] 1512*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 1513*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 1514*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*1] 1515*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1516*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm3 1517*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm3 1518*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm4 1519*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm4 1520*c0909341SAndroid Build Coastguard Worker phaddw xm0, xm1 1521*c0909341SAndroid Build Coastguard Worker paddw xm0, xm5 1522*c0909341SAndroid Build Coastguard Worker psraw xm0, 6 1523*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 1524*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm0 1525*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm0, 1 1526*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1527*c0909341SAndroid Build Coastguard Worker sub hd, 2 1528*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 1529*c0909341SAndroid Build Coastguard Worker RET 1530*c0909341SAndroid Build Coastguard Worker.h: 1531*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1532*c0909341SAndroid Build Coastguard Worker jnz .hv 1533*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) 1534*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1535*c0909341SAndroid Build Coastguard Worker jle .h_w2 1536*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 11 1537*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 1538*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [z_filter_s+ 2] ; 01 1539*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1540*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, 
[z_filter_s+ 6] ; 23 1541*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1542*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [z_filter_s+10] ; 45 1543*c0909341SAndroid Build Coastguard Worker lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] 1544*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)] 1545*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, [mxq+0] 1546*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [mxq+2] 1547*c0909341SAndroid Build Coastguard Worker add wq, r8 1548*c0909341SAndroid Build Coastguard Worker vpbroadcastw m10, [mxq+4] 1549*c0909341SAndroid Build Coastguard Worker jmp wq 1550*c0909341SAndroid Build Coastguard Worker.h_w8: 1551*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] 1552*c0909341SAndroid Build Coastguard Worker pshufb m%2, m%1, m4 1553*c0909341SAndroid Build Coastguard Worker pmaddubsw m%2, m8 1554*c0909341SAndroid Build Coastguard Worker pshufb m%3, m%1, m6 1555*c0909341SAndroid Build Coastguard Worker pmaddubsw m%3, m9 1556*c0909341SAndroid Build Coastguard Worker pshufb m%1, m7 1557*c0909341SAndroid Build Coastguard Worker pmaddubsw m%1, m10 1558*c0909341SAndroid Build Coastguard Worker paddw m%2, m5 1559*c0909341SAndroid Build Coastguard Worker paddw m%1, m%3 1560*c0909341SAndroid Build Coastguard Worker paddw m%1, m%2 1561*c0909341SAndroid Build Coastguard Worker psraw m%1, 6 1562*c0909341SAndroid Build Coastguard Worker%endmacro 1563*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 1564*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1], 1 1565*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1566*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 0, 1, 2 1567*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1568*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 1569*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 
1570*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm0 1571*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1572*c0909341SAndroid Build Coastguard Worker sub hd, 2 1573*c0909341SAndroid Build Coastguard Worker jg .h_w8 1574*c0909341SAndroid Build Coastguard Worker RET 1575*c0909341SAndroid Build Coastguard Worker.h_w16: 1576*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+8*0] 1577*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+8*0], 1 1578*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0+8*1] 1579*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1+8*1], 1 1580*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 0, 2, 3 1581*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1582*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 1, 2, 3 1583*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1584*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 1585*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 1586*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1587*c0909341SAndroid Build Coastguard Worker sub hd, 2 1588*c0909341SAndroid Build Coastguard Worker jg .h_w16 1589*c0909341SAndroid Build Coastguard Worker RET 1590*c0909341SAndroid Build Coastguard Worker.h_w32: 1591*c0909341SAndroid Build Coastguard Worker xor r6d, r6d 1592*c0909341SAndroid Build Coastguard Worker jmp .h_start 1593*c0909341SAndroid Build Coastguard Worker.h_w64: 1594*c0909341SAndroid Build Coastguard Worker mov r6, -32*1 1595*c0909341SAndroid Build Coastguard Worker jmp .h_start 1596*c0909341SAndroid Build Coastguard Worker.h_w128: 1597*c0909341SAndroid Build Coastguard Worker mov r6, -32*3 1598*c0909341SAndroid Build Coastguard Worker.h_start: 1599*c0909341SAndroid Build Coastguard Worker sub srcq, r6 1600*c0909341SAndroid Build Coastguard Worker sub dstq, r6 1601*c0909341SAndroid Build Coastguard Worker mov r4, r6 
1602*c0909341SAndroid Build Coastguard Worker.h_loop: 1603*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6+8*0] 1604*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6+8*1] 1605*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 0, 2, 3 1606*c0909341SAndroid Build Coastguard Worker PUT_6TAP_H 1, 2, 3 1607*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1608*c0909341SAndroid Build Coastguard Worker mova [dstq+r6], m0 1609*c0909341SAndroid Build Coastguard Worker add r6, 32 1610*c0909341SAndroid Build Coastguard Worker jle .h_loop 1611*c0909341SAndroid Build Coastguard Worker add srcq, ssq 1612*c0909341SAndroid Build Coastguard Worker add dstq, dsq 1613*c0909341SAndroid Build Coastguard Worker mov r6, r4 1614*c0909341SAndroid Build Coastguard Worker dec hd 1615*c0909341SAndroid Build Coastguard Worker jg .h_loop 1616*c0909341SAndroid Build Coastguard Worker RET 1617*c0909341SAndroid Build Coastguard Worker.v: 1618*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 9, 12 1619*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1620*c0909341SAndroid Build Coastguard Worker shr myd, 16 1621*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1622*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1623*c0909341SAndroid Build Coastguard Worker tzcnt r6d, wd 1624*c0909341SAndroid Build Coastguard Worker movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] 1625*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_512] 1626*c0909341SAndroid Build Coastguard Worker lea myq, [r8+myq*8+subpel_filters+1-put_avx2] 1627*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, [myq+0] 1628*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [myq+2] 1629*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [myq+4] 1630*c0909341SAndroid Build Coastguard Worker add r6, r8 1631*c0909341SAndroid Build Coastguard Worker mov nsq, ssq 1632*c0909341SAndroid Build Coastguard Worker neg nsq 1633*c0909341SAndroid Build Coastguard 
Worker jmp r6 1634*c0909341SAndroid Build Coastguard Worker.v_w2: 1635*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+nsq*2] 1636*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+nsq*1], 2 1637*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+ssq*0], 4 1638*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3 1639*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1640*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 1641*c0909341SAndroid Build Coastguard Worker palignr xm3, xm0, xm2, 4 ; 1 2 3 4 1642*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm2, xm3 ; 01 12 1643*c0909341SAndroid Build Coastguard Worker punpckhbw xm2, xm3 ; 23 34 1644*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 1645*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [srcq+ssq*1] 1646*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1647*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm1, xm5 ; a0 b0 1648*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1649*c0909341SAndroid Build Coastguard Worker pmaddubsw xm2, xm6 ; a1 b1 1650*c0909341SAndroid Build Coastguard Worker paddw xm3, xm2 1651*c0909341SAndroid Build Coastguard Worker vpblendd xm2, xm0, xm4, 0x02 ; 4 5 1652*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 1653*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x02 ; 5 6 1654*c0909341SAndroid Build Coastguard Worker punpcklbw xm2, xm4 ; 45 56 1655*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm2, xm7 ; a2 b2 1656*c0909341SAndroid Build Coastguard Worker paddw xm3, xm4 1657*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, xm8 1658*c0909341SAndroid Build Coastguard Worker packuswb xm3, xm3 1659*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm3, 0 1660*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm3, 2 1661*c0909341SAndroid Build Coastguard Worker lea dstq, 
[dstq+dsq*2] 1662*c0909341SAndroid Build Coastguard Worker sub hd, 2 1663*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 1664*c0909341SAndroid Build Coastguard Worker RET 1665*c0909341SAndroid Build Coastguard Worker.v_w4: 1666*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+nsq*2] 1667*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+nsq*1], 1 1668*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*0], 2 1669*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 1670*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1671*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 1672*c0909341SAndroid Build Coastguard Worker palignr xm3, xm0, xm2, 4 ; 1 2 3 4 1673*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm2, xm3 ; 01 12 1674*c0909341SAndroid Build Coastguard Worker punpckhbw xm2, xm3 ; 23 34 1675*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1676*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [srcq+ssq*1] 1677*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1678*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm1, xm5 ; a0 b0 1679*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1680*c0909341SAndroid Build Coastguard Worker pmaddubsw xm2, xm6 ; a1 b1 1681*c0909341SAndroid Build Coastguard Worker paddw xm3, xm2 1682*c0909341SAndroid Build Coastguard Worker vpblendd xm2, xm0, xm4, 0x02 ; 4 5 1683*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 1684*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x02 ; 5 6 1685*c0909341SAndroid Build Coastguard Worker punpcklbw xm2, xm4 ; 45 56 1686*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm2, xm7 ; a2 b2 1687*c0909341SAndroid Build Coastguard Worker paddw xm3, xm4 1688*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, xm8 1689*c0909341SAndroid Build Coastguard Worker packuswb xm3, xm3 1690*c0909341SAndroid Build Coastguard 
Worker movd [dstq+dsq*0], xm3 1691*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm3, 1 1692*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1693*c0909341SAndroid Build Coastguard Worker sub hd, 2 1694*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1695*c0909341SAndroid Build Coastguard Worker RET 1696*c0909341SAndroid Build Coastguard Worker.v_w8: 1697*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+nsq*2] 1698*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+nsq*1] 1699*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 1700*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 1701*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1702*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 1703*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0x30 1704*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0x30 1705*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 ; 01 12 1706*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0x30 1707*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 1708*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 ; 23 34 1709*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1710*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 1711*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1712*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m1, m5 ; a0 b0 1713*c0909341SAndroid Build Coastguard Worker mova m1, m2 1714*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 ; a1 b1 1715*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1716*c0909341SAndroid Build Coastguard Worker vpblendd m2, m0, m4, 0x30 1717*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 1718*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 1719*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 ; 45 56 
1720*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m7 ; a2 b2 1721*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1722*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m8 1723*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m3, 1 1724*c0909341SAndroid Build Coastguard Worker packuswb xm3, xm4 1725*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm3 1726*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm3 1727*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1728*c0909341SAndroid Build Coastguard Worker sub hd, 2 1729*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1730*c0909341SAndroid Build Coastguard Worker RET 1731*c0909341SAndroid Build Coastguard Worker.v_w16: 1732*c0909341SAndroid Build Coastguard Worker.v_w32: 1733*c0909341SAndroid Build Coastguard Worker.v_w64: 1734*c0909341SAndroid Build Coastguard Worker.v_w128: 1735*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*8-128] 1736*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 12 1737*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*2] 1738*c0909341SAndroid Build Coastguard Worker.v_w16_loop0: 1739*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [srcq+nsq*2] 1740*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+nsq*1] 1741*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ssq*2] 1742*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 1743*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [srcq+ssq*1] 1744*c0909341SAndroid Build Coastguard Worker mov r7, dstq 1745*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r4+ssq*0] 1746*c0909341SAndroid Build Coastguard Worker shufpd m3, m0, 0x0c 1747*c0909341SAndroid Build Coastguard Worker shufpd m4, m1, 0x0c 1748*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3, m4 ; 01 1749*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 ; 23 1750*c0909341SAndroid Build Coastguard 
Worker shufpd m0, m2, 0x0c 1751*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4, m0 ; 12 1752*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m0 ; 34 1753*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1754*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [r4+ssq*1] 1755*c0909341SAndroid Build Coastguard Worker pmaddubsw m10, m1, m5 ; a0 1756*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 1757*c0909341SAndroid Build Coastguard Worker pmaddubsw m11, m2, m5 ; b0 1758*c0909341SAndroid Build Coastguard Worker mova m1, m3 1759*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 ; a1 1760*c0909341SAndroid Build Coastguard Worker mova m2, m4 1761*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 ; b1 1762*c0909341SAndroid Build Coastguard Worker paddw m10, m3 1763*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r4+ssq*0] 1764*c0909341SAndroid Build Coastguard Worker paddw m11, m4 1765*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, m9, 0x0d 1766*c0909341SAndroid Build Coastguard Worker shufpd m0, m9, m3, 0x0c 1767*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4, m0 ; 45 1768*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m0 ; 56 1769*c0909341SAndroid Build Coastguard Worker pmaddubsw m9, m3, m7 ; a2 1770*c0909341SAndroid Build Coastguard Worker paddw m10, m9 1771*c0909341SAndroid Build Coastguard Worker pmaddubsw m9, m4, m7 ; b2 1772*c0909341SAndroid Build Coastguard Worker paddw m11, m9 1773*c0909341SAndroid Build Coastguard Worker pmulhrsw m10, m8 1774*c0909341SAndroid Build Coastguard Worker pmulhrsw m11, m8 1775*c0909341SAndroid Build Coastguard Worker packuswb m10, m11 1776*c0909341SAndroid Build Coastguard Worker vpermq m10, m10, q3120 1777*c0909341SAndroid Build Coastguard Worker mova [r7+dsq*0], xm10 1778*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+dsq*1], m10, 1 1779*c0909341SAndroid Build Coastguard Worker lea r7, [r7+dsq*2] 1780*c0909341SAndroid 
Build Coastguard Worker sub hd, 2 1781*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1782*c0909341SAndroid Build Coastguard Worker add srcq, 16 1783*c0909341SAndroid Build Coastguard Worker add dstq, 16 1784*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 1785*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 1786*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop0 1787*c0909341SAndroid Build Coastguard Worker RET 1788*c0909341SAndroid Build Coastguard Worker.hv: 1789*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 16 1790*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1791*c0909341SAndroid Build Coastguard Worker jg .hv_w8 1792*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1793*c0909341SAndroid Build Coastguard Worker dec srcq 1794*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2] 1795*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1796*c0909341SAndroid Build Coastguard Worker shr myd, 16 1797*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1798*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1799*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] 1800*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 1801*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 1802*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_512] 1803*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 1804*c0909341SAndroid Build Coastguard Worker mov nsq, ssq 1805*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q0000 1806*c0909341SAndroid Build Coastguard Worker neg nsq 1807*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q1111 1808*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q2222 1809*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1810*c0909341SAndroid Build Coastguard Worker je .hv_w4 1811*c0909341SAndroid Build Coastguard Worker vbroadcasti128 
m5, [subpel_h_shuf4] 1812*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+nsq*2] 1813*c0909341SAndroid Build Coastguard Worker movhps xm2, [srcq+nsq*1] 1814*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 1815*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+ssq*1] 1816*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1817*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+ssq*0] 1818*c0909341SAndroid Build Coastguard Worker vpblendd m2, m1, 0x30 1819*c0909341SAndroid Build Coastguard Worker pshufb m2, m5 1820*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm5 1821*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1822*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm6 1823*c0909341SAndroid Build Coastguard Worker phaddw m2, m0 1824*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m7 1825*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m2, 1 1826*c0909341SAndroid Build Coastguard Worker palignr xm0, xm2, 4 1827*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2, xm0 ; 01 12 1828*c0909341SAndroid Build Coastguard Worker punpckhwd xm2, xm0 ; 23 34 1829*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 1830*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+ssq*1] 1831*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1832*c0909341SAndroid Build Coastguard Worker movhps xm4, [srcq+ssq*0] 1833*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm5 1834*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm6 1835*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm9, xm1 ; a0 b0 1836*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1837*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm10 ; a1 b1 1838*c0909341SAndroid Build Coastguard Worker phaddw xm4, xm4 1839*c0909341SAndroid Build Coastguard Worker paddd xm3, xm2 1840*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm7 1841*c0909341SAndroid Build 
Coastguard Worker palignr xm2, xm4, xm0, 12 1842*c0909341SAndroid Build Coastguard Worker mova xm0, xm4 1843*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm4 ; 45 56 1844*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm11, xm2 ; a2 b2 1845*c0909341SAndroid Build Coastguard Worker paddd xm3, xm8 1846*c0909341SAndroid Build Coastguard Worker paddd xm3, xm4 1847*c0909341SAndroid Build Coastguard Worker psrad xm3, 10 1848*c0909341SAndroid Build Coastguard Worker packssdw xm3, xm3 1849*c0909341SAndroid Build Coastguard Worker packuswb xm3, xm3 1850*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm3, 0 1851*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm3, 1 1852*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1853*c0909341SAndroid Build Coastguard Worker sub hd, 2 1854*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 1855*c0909341SAndroid Build Coastguard Worker RET 1856*c0909341SAndroid Build Coastguard Worker.hv_w4: 1857*c0909341SAndroid Build Coastguard Worker mova m5, [subpel_h_shuf4] 1858*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+nsq*2] 1859*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+nsq*1] 1860*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+ssq*0] 1861*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 1862*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1863*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 1864*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0xcc ; 0 1 1865*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0xcc ; 2 3 1866*c0909341SAndroid Build Coastguard Worker pshufb m2, m5 1867*c0909341SAndroid Build Coastguard Worker pshufb m1, m5 1868*c0909341SAndroid Build Coastguard Worker pshufb m0, m5 1869*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1870*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1871*c0909341SAndroid 
Build Coastguard Worker pmaddubsw m0, m6 1872*c0909341SAndroid Build Coastguard Worker phaddw m2, m1 1873*c0909341SAndroid Build Coastguard Worker phaddw m0, m0 1874*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m7 1875*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 1876*c0909341SAndroid Build Coastguard Worker palignr m3, m0, m2, 4 1877*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 ; 01 12 1878*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 ; 23 34 1879*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1880*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 1881*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1882*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9, m1 ; a0 b0 1883*c0909341SAndroid Build Coastguard Worker mova m1, m2 1884*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m10 ; a1 b1 1885*c0909341SAndroid Build Coastguard Worker paddd m3, m2 1886*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 1887*c0909341SAndroid Build Coastguard Worker vpblendd m4, m2, 0xcc ; 5 6 1888*c0909341SAndroid Build Coastguard Worker pshufb m4, m5 1889*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1890*c0909341SAndroid Build Coastguard Worker phaddw m4, m4 1891*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 1892*c0909341SAndroid Build Coastguard Worker palignr m2, m4, m0, 12 1893*c0909341SAndroid Build Coastguard Worker mova m0, m4 1894*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 ; 45 56 1895*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m11, m2 ; a2 b2 1896*c0909341SAndroid Build Coastguard Worker paddd m3, m8 1897*c0909341SAndroid Build Coastguard Worker paddd m3, m4 1898*c0909341SAndroid Build Coastguard Worker psrad m3, 10 1899*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m3, 1 1900*c0909341SAndroid Build Coastguard Worker packssdw xm3, xm4 1901*c0909341SAndroid Build Coastguard Worker packuswb xm3, 
xm3 1902*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm3, q3120 1903*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm3 1904*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm3, 1 1905*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1906*c0909341SAndroid Build Coastguard Worker sub hd, 2 1907*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1908*c0909341SAndroid Build Coastguard Worker RET 1909*c0909341SAndroid Build Coastguard Worker.hv_w8: 1910*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1911*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1912*c0909341SAndroid Build Coastguard Worker lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] 1913*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 16 1914*c0909341SAndroid Build Coastguard Worker vpbroadcastw m10, [mxq+0] 1915*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, [mxq+2] 1916*c0909341SAndroid Build Coastguard Worker vpbroadcastw m12, [mxq+4] 1917*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1918*c0909341SAndroid Build Coastguard Worker shr myd, 16 1919*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1920*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1921*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] 1922*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*8-64] 1923*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [z_filter_s+ 6] 1924*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 1925*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [z_filter_s+10] 1926*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 1927*c0909341SAndroid Build Coastguard Worker mov nsq, ssq 1928*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q0000 1929*c0909341SAndroid Build Coastguard Worker neg nsq 1930*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q1111 1931*c0909341SAndroid Build Coastguard Worker lea r6d, 
[hq+r6*4] 1932*c0909341SAndroid Build Coastguard Worker pshufd m15, m0, q2222 1933*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 1934*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [z_filter_s+2] 1935*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+nsq*2] 1936*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ssq*2] 1937*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+nsq*1] 1938*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [srcq+ssq*0] 1939*c0909341SAndroid Build Coastguard Worker mov r7, dstq 1940*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+ssq*1], 1 ; 1 3 1941*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0xf0 ; 0 2 1942*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r4+ssq*0], 1 ; 2 4 1943*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_8192] 1944*c0909341SAndroid Build Coastguard Worker%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3] 1945*c0909341SAndroid Build Coastguard Worker pshufb %2, %1, %4 1946*c0909341SAndroid Build Coastguard Worker pmaddubsw %2, m10 1947*c0909341SAndroid Build Coastguard Worker pshufb %3, %1, %5 1948*c0909341SAndroid Build Coastguard Worker pmaddubsw %3, m11 1949*c0909341SAndroid Build Coastguard Worker pshufb %1, %6 1950*c0909341SAndroid Build Coastguard Worker pmaddubsw %1, m12 1951*c0909341SAndroid Build Coastguard Worker paddw %2, %3 1952*c0909341SAndroid Build Coastguard Worker paddw %1, %2 1953*c0909341SAndroid Build Coastguard Worker%endmacro 1954*c0909341SAndroid Build Coastguard Worker HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 1955*c0909341SAndroid Build Coastguard Worker HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 1956*c0909341SAndroid Build Coastguard Worker HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 1957*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, q3120 1958*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q3120 1959*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 1960*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m3, m5 1961*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m5 1962*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 1963*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 ; 01 1964*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 ; 23 1965*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 ; 12 1966*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 34 1967*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 1968*c0909341SAndroid Build Coastguard Worker movu xm7, [r4+ssq*1] 1969*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 1970*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [r4+ssq*0], 1 ; 5 6 1971*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m13, m1 ; a0 1972*c0909341SAndroid Build Coastguard Worker mova m1, m3 1973*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m13, m2 ; b0 1974*c0909341SAndroid Build Coastguard Worker mova m2, m4 1975*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 ; a1 1976*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m14 ; b1 1977*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1978*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [z_filter_s+2] 1979*c0909341SAndroid Build Coastguard Worker paddd m6, m4 1980*c0909341SAndroid Build Coastguard Worker HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 1981*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [pw_8192] 1982*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pd_512] 1983*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m3 1984*c0909341SAndroid Build Coastguard Worker paddd m5, m4 1985*c0909341SAndroid Build Coastguard Worker paddd m6, m4 1986*c0909341SAndroid Build Coastguard Worker mova m4, m0 1987*c0909341SAndroid Build Coastguard Worker vpermq m0, m7, q3120 1988*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, 0x05 1989*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m0 ; 45 1990*c0909341SAndroid Build 
Coastguard Worker pmaddwd m7, m15, m3 ; a2 1991*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 56 1992*c0909341SAndroid Build Coastguard Worker paddd m5, m7 1993*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m15, m4 ; b2 1994*c0909341SAndroid Build Coastguard Worker paddd m6, m7 1995*c0909341SAndroid Build Coastguard Worker psrad m5, 10 1996*c0909341SAndroid Build Coastguard Worker psrad m6, 10 1997*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 1998*c0909341SAndroid Build Coastguard Worker vextracti128 xm6, m5, 1 1999*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm6 2000*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm5, q3120 2001*c0909341SAndroid Build Coastguard Worker movq [r7+dsq*0], xm5 2002*c0909341SAndroid Build Coastguard Worker movhps [r7+dsq*1], xm5 2003*c0909341SAndroid Build Coastguard Worker lea r7, [r7+dsq*2] 2004*c0909341SAndroid Build Coastguard Worker sub hd, 2 2005*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 2006*c0909341SAndroid Build Coastguard Worker add srcq, 8 2007*c0909341SAndroid Build Coastguard Worker add dstq, 8 2008*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 2009*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 2010*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop0 2011*c0909341SAndroid Build Coastguard Worker RET 2012*c0909341SAndroid Build Coastguard Worker 2013*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc 2014*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc 2015*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc 2016*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc 2017*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp, SHARP, SHARP 2018*c0909341SAndroid Build Coastguard Worker 2019*c0909341SAndroid Build Coastguard Workercglobal put_8tap_8bpc, 4, 9, 0, 
dst, ds, src, ss, w, h, mx, my, ss3 2020*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 2021*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 2022*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 2023*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 2024*c0909341SAndroid Build Coastguard Worker lea r8, [put_avx2] 2025*c0909341SAndroid Build Coastguard Worker movsxd wq, wm 2026*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 2027*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 2028*c0909341SAndroid Build Coastguard Worker jnz .h 2029*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2030*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put 2031*c0909341SAndroid Build Coastguard Worker.v: 2032*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 15 2033*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2034*c0909341SAndroid Build Coastguard Worker shr myd, 16 2035*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2036*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2037*c0909341SAndroid Build Coastguard Worker tzcnt r6d, wd 2038*c0909341SAndroid Build Coastguard Worker movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] 2039*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_512] 2040*c0909341SAndroid Build Coastguard Worker lea myq, [r8+myq*8+subpel_filters-put_avx2] 2041*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, [myq+0] 2042*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [myq+2] 2043*c0909341SAndroid Build Coastguard Worker vpbroadcastw m10, [myq+4] 2044*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, [myq+6] 2045*c0909341SAndroid Build Coastguard Worker add r6, r8 2046*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2047*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 2048*c0909341SAndroid Build Coastguard Worker jmp r6 
2049*c0909341SAndroid Build Coastguard Worker.v_w2: 2050*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+ssq*0] 2051*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+ssq*1], 2 2052*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+ssq*2], 4 2053*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2054*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 2055*c0909341SAndroid Build Coastguard Worker movd xm3, [srcq+ssq*1] 2056*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm1, [srcq+ssq*2] 2057*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2058*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 2059*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm1, 0x02 ; 4 5 2060*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm0, 0x02 ; 5 6 2061*c0909341SAndroid Build Coastguard Worker palignr xm4, xm3, xm2, 4 ; 1 2 3 4 2062*c0909341SAndroid Build Coastguard Worker punpcklbw xm3, xm1 ; 45 56 2063*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm2, xm4 ; 01 12 2064*c0909341SAndroid Build Coastguard Worker punpckhbw xm2, xm4 ; 23 34 2065*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 2066*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm1, xm8 ; a0 b0 2067*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 2068*c0909341SAndroid Build Coastguard Worker pmaddubsw xm2, xm9 ; a1 b1 2069*c0909341SAndroid Build Coastguard Worker paddw xm5, xm2 2070*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 2071*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm10 ; a2 b2 2072*c0909341SAndroid Build Coastguard Worker paddw xm5, xm3 2073*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [srcq+ssq*1] 2074*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2075*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm0, xm4, 0x02 ; 6 7 2076*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 
2077*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x02 ; 7 8 2078*c0909341SAndroid Build Coastguard Worker punpcklbw xm3, xm4 ; 67 78 2079*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm3, xm11 ; a3 b3 2080*c0909341SAndroid Build Coastguard Worker paddw xm5, xm4 2081*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm7 2082*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm5 2083*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm5, 0 2084*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm5, 2 2085*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2086*c0909341SAndroid Build Coastguard Worker sub hd, 2 2087*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 2088*c0909341SAndroid Build Coastguard Worker RET 2089*c0909341SAndroid Build Coastguard Worker.v_w4: 2090*c0909341SAndroid Build Coastguard Worker movd xm2, [srcq+ssq*0] 2091*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*1], 1 2092*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*2], 2 2093*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2094*c0909341SAndroid Build Coastguard Worker pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 2095*c0909341SAndroid Build Coastguard Worker movd xm3, [srcq+ssq*1] 2096*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm1, [srcq+ssq*2] 2097*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2098*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 2099*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm1, 0x02 ; 4 5 2100*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm0, 0x02 ; 5 6 2101*c0909341SAndroid Build Coastguard Worker palignr xm4, xm3, xm2, 4 ; 1 2 3 4 2102*c0909341SAndroid Build Coastguard Worker punpcklbw xm3, xm1 ; 45 56 2103*c0909341SAndroid Build Coastguard Worker punpcklbw xm1, xm2, xm4 ; 01 12 2104*c0909341SAndroid Build Coastguard Worker punpckhbw xm2, xm4 ; 23 34 2105*c0909341SAndroid Build Coastguard 
Worker.v_w4_loop: 2106*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm1, xm8 ; a0 b0 2107*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 2108*c0909341SAndroid Build Coastguard Worker pmaddubsw xm2, xm9 ; a1 b1 2109*c0909341SAndroid Build Coastguard Worker paddw xm5, xm2 2110*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 2111*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm10 ; a2 b2 2112*c0909341SAndroid Build Coastguard Worker paddw xm5, xm3 2113*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm4, [srcq+ssq*1] 2114*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2115*c0909341SAndroid Build Coastguard Worker vpblendd xm3, xm0, xm4, 0x02 ; 6 7 2116*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+ssq*0] 2117*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x02 ; 7 8 2118*c0909341SAndroid Build Coastguard Worker punpcklbw xm3, xm4 ; 67 78 2119*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm3, xm11 ; a3 b3 2120*c0909341SAndroid Build Coastguard Worker paddw xm5, xm4 2121*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm7 2122*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm5 2123*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm5 2124*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm5, 1 2125*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2126*c0909341SAndroid Build Coastguard Worker sub hd, 2 2127*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 2128*c0909341SAndroid Build Coastguard Worker RET 2129*c0909341SAndroid Build Coastguard Worker.v_w8: 2130*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*0] 2131*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 2132*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*2] 2133*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2134*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+ssq*0] 
2135*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 2136*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [srcq+ssq*2] 2137*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2138*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 2139*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0x30 2140*c0909341SAndroid Build Coastguard Worker vpblendd m4, m2, 0x30 2141*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m4 ; 01 12 2142*c0909341SAndroid Build Coastguard Worker vpblendd m2, m5, 0x30 2143*c0909341SAndroid Build Coastguard Worker vpblendd m5, m3, 0x30 2144*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m5 ; 23 34 2145*c0909341SAndroid Build Coastguard Worker vpblendd m3, m6, 0x30 2146*c0909341SAndroid Build Coastguard Worker vpblendd m6, m0, 0x30 2147*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m6 ; 45 56 2148*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 2149*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 2150*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2151*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m1, m8 ; a0 b0 2152*c0909341SAndroid Build Coastguard Worker mova m1, m2 2153*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m9 ; a1 b1 2154*c0909341SAndroid Build Coastguard Worker paddw m5, m2 2155*c0909341SAndroid Build Coastguard Worker mova m2, m3 2156*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m10 ; a2 b2 2157*c0909341SAndroid Build Coastguard Worker paddw m5, m3 2158*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, m4, 0x30 2159*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*0] 2160*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 2161*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4 ; 67 78 2162*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m11 ; a3 b3 2163*c0909341SAndroid Build Coastguard Worker paddw m5, m4 
2164*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 2165*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m5, 1 2166*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm4 2167*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm5 2168*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm5 2169*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2170*c0909341SAndroid Build Coastguard Worker sub hd, 2 2171*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 2172*c0909341SAndroid Build Coastguard Worker RET 2173*c0909341SAndroid Build Coastguard Worker.v_w16: 2174*c0909341SAndroid Build Coastguard Worker.v_w32: 2175*c0909341SAndroid Build Coastguard Worker.v_w64: 2176*c0909341SAndroid Build Coastguard Worker.v_w128: 2177*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*8-128] 2178*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 15 2179*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*2] 2180*c0909341SAndroid Build Coastguard Worker.v_w16_loop0: 2181*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [srcq+ssq*0] 2182*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [srcq+ssq*1] 2183*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ss3q] 2184*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [srcq+ssq*2] 2185*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [r4+ssq*0] 2186*c0909341SAndroid Build Coastguard Worker mov r7, dstq 2187*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [r4+ssq*1] 2188*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r4+ssq*2] 2189*c0909341SAndroid Build Coastguard Worker add r4, ss3q 2190*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r4+ssq*0] 2191*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, 0x0c 2192*c0909341SAndroid Build Coastguard Worker shufpd m5, m1, 0x0c 2193*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m4, m5 ; 01 2194*c0909341SAndroid Build 
Coastguard Worker punpckhbw m4, m5 ; 34 2195*c0909341SAndroid Build Coastguard Worker shufpd m6, m2, 0x0c 2196*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m5, m6 ; 12 2197*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 ; 45 2198*c0909341SAndroid Build Coastguard Worker shufpd m0, m3, 0x0c 2199*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m6, m0 ; 23 2200*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 ; 56 2201*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 2202*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m12, [r4+ssq*1] 2203*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 2204*c0909341SAndroid Build Coastguard Worker pmaddubsw m13, m1, m8 ; a0 2205*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m2, m8 ; b0 2206*c0909341SAndroid Build Coastguard Worker mova m1, m3 2207*c0909341SAndroid Build Coastguard Worker mova m2, m4 2208*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m9 ; a1 2209*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m9 ; b1 2210*c0909341SAndroid Build Coastguard Worker paddw m13, m3 2211*c0909341SAndroid Build Coastguard Worker paddw m14, m4 2212*c0909341SAndroid Build Coastguard Worker mova m3, m5 2213*c0909341SAndroid Build Coastguard Worker mova m4, m6 2214*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m10 ; a2 2215*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m10 ; b2 2216*c0909341SAndroid Build Coastguard Worker paddw m13, m5 2217*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [r4+ssq*0] 2218*c0909341SAndroid Build Coastguard Worker paddw m14, m6 2219*c0909341SAndroid Build Coastguard Worker shufpd m6, m0, m12, 0x0d 2220*c0909341SAndroid Build Coastguard Worker shufpd m0, m12, m5, 0x0c 2221*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m6, m0 ; 67 2222*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 ; 78 2223*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m5, m11 ; a3 2224*c0909341SAndroid 
Build Coastguard Worker paddw m13, m12 2225*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m6, m11 ; b3 2226*c0909341SAndroid Build Coastguard Worker paddw m14, m12 2227*c0909341SAndroid Build Coastguard Worker pmulhrsw m13, m7 2228*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m7 2229*c0909341SAndroid Build Coastguard Worker packuswb m13, m14 2230*c0909341SAndroid Build Coastguard Worker vpermq m13, m13, q3120 2231*c0909341SAndroid Build Coastguard Worker mova [r7+dsq*0], xm13 2232*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+dsq*1], m13, 1 2233*c0909341SAndroid Build Coastguard Worker lea r7, [r7+dsq*2] 2234*c0909341SAndroid Build Coastguard Worker sub hd, 2 2235*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 2236*c0909341SAndroid Build Coastguard Worker add srcq, 16 2237*c0909341SAndroid Build Coastguard Worker add dstq, 16 2238*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 2239*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 2240*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop0 2241*c0909341SAndroid Build Coastguard Worker RET 2242*c0909341SAndroid Build Coastguard Worker.h: 2243*c0909341SAndroid Build Coastguard Worker.h_w2: 2244*c0909341SAndroid Build Coastguard Worker.h_w4: 2245*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2246*c0909341SAndroid Build Coastguard Worker jnz .hv 2247*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) 2248*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2249*c0909341SAndroid Build Coastguard Worker jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2 2250*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 11 2251*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 2252*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufA] 2253*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2254*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufB] 2255*c0909341SAndroid Build 
Coastguard Worker sub srcq, 3 2256*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufC] 2257*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] 2258*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] 2259*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] 2260*c0909341SAndroid Build Coastguard Worker add wq, r8 2261*c0909341SAndroid Build Coastguard Worker jmp wq 2262*c0909341SAndroid Build Coastguard Worker.h_w8: 2263*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] 2264*c0909341SAndroid Build Coastguard Worker pshufb m%2, m%1, m7 2265*c0909341SAndroid Build Coastguard Worker pshufb m%3, m%1, m8 2266*c0909341SAndroid Build Coastguard Worker pshufb m%1, m6 2267*c0909341SAndroid Build Coastguard Worker pmaddubsw m%4, m%2, m9 2268*c0909341SAndroid Build Coastguard Worker pmaddubsw m%2, m10 2269*c0909341SAndroid Build Coastguard Worker pmaddubsw m%3, m10 2270*c0909341SAndroid Build Coastguard Worker pmaddubsw m%1, m9 2271*c0909341SAndroid Build Coastguard Worker paddw m%3, m%4 2272*c0909341SAndroid Build Coastguard Worker paddw m%1, m%2 2273*c0909341SAndroid Build Coastguard Worker phaddw m%1, m%3 2274*c0909341SAndroid Build Coastguard Worker paddw m%1, m5 2275*c0909341SAndroid Build Coastguard Worker psraw m%1, 6 2276*c0909341SAndroid Build Coastguard Worker%endmacro 2277*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 2278*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1], 1 2279*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2280*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 1, 2, 3 2281*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 2282*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 2283*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 2284*c0909341SAndroid Build Coastguard 
Worker movhps [dstq+dsq*1], xm0 2285*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2286*c0909341SAndroid Build Coastguard Worker sub hd, 2 2287*c0909341SAndroid Build Coastguard Worker jg .h_w8 2288*c0909341SAndroid Build Coastguard Worker RET 2289*c0909341SAndroid Build Coastguard Worker.h_w16: 2290*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0+8*0] 2291*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*1+8*0], 1 2292*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0+8*1] 2293*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*1+8*1], 1 2294*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 2, 3, 4 2295*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2296*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 1, 2, 3, 4 2297*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2298*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 2299*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 2300*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2301*c0909341SAndroid Build Coastguard Worker sub hd, 2 2302*c0909341SAndroid Build Coastguard Worker jg .h_w16 2303*c0909341SAndroid Build Coastguard Worker RET 2304*c0909341SAndroid Build Coastguard Worker.h_w32: 2305*c0909341SAndroid Build Coastguard Worker xor r6d, r6d 2306*c0909341SAndroid Build Coastguard Worker jmp .h_start 2307*c0909341SAndroid Build Coastguard Worker.h_w64: 2308*c0909341SAndroid Build Coastguard Worker mov r6, -32*1 2309*c0909341SAndroid Build Coastguard Worker jmp .h_start 2310*c0909341SAndroid Build Coastguard Worker.h_w128: 2311*c0909341SAndroid Build Coastguard Worker mov r6, -32*3 2312*c0909341SAndroid Build Coastguard Worker.h_start: 2313*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2314*c0909341SAndroid Build Coastguard Worker sub dstq, r6 2315*c0909341SAndroid Build Coastguard Worker mov r4, r6 2316*c0909341SAndroid Build Coastguard 
Worker.h_loop: 2317*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+r6+8*0] 2318*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+r6+8*1] 2319*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 2, 3, 4 2320*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 1, 2, 3, 4 2321*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2322*c0909341SAndroid Build Coastguard Worker mova [dstq+r6], m0 2323*c0909341SAndroid Build Coastguard Worker add r6, 32 2324*c0909341SAndroid Build Coastguard Worker jle .h_loop 2325*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2326*c0909341SAndroid Build Coastguard Worker add dstq, dsq 2327*c0909341SAndroid Build Coastguard Worker mov r6, r4 2328*c0909341SAndroid Build Coastguard Worker dec hd 2329*c0909341SAndroid Build Coastguard Worker jg .h_loop 2330*c0909341SAndroid Build Coastguard Worker RET 2331*c0909341SAndroid Build Coastguard Worker.hv: 2332*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 14, 16 2333*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2334*c0909341SAndroid Build Coastguard Worker jg .hv_w8 2335*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2336*c0909341SAndroid Build Coastguard Worker dec srcq 2337*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] 2338*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2339*c0909341SAndroid Build Coastguard Worker shr myd, 16 2340*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2341*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2342*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] 2343*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2344*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 2345*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2346*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2347*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_8192] 
2348*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_512] 2349*c0909341SAndroid Build Coastguard Worker pshufd m10, m0, q0000 2350*c0909341SAndroid Build Coastguard Worker pshufd m11, m0, q1111 2351*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q2222 2352*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q3333 2353*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2354*c0909341SAndroid Build Coastguard Worker je .hv_w4 2355*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shuf4] 2356*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+ssq*0] 2357*c0909341SAndroid Build Coastguard Worker movhps xm2, [srcq+ssq*1] 2358*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*2] 2359*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2360*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+ssq*0] 2361*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 2362*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*2] 2363*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2364*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+ssq*0] 2365*c0909341SAndroid Build Coastguard Worker vpblendd m2, m3, 0x30 2366*c0909341SAndroid Build Coastguard Worker vpblendd m0, m1, 0x30 2367*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0xc0 2368*c0909341SAndroid Build Coastguard Worker pshufb m2, m6 2369*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 2370*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7 2371*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7 2372*c0909341SAndroid Build Coastguard Worker phaddw m2, m0 2373*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m8 2374*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m2, 1 2375*c0909341SAndroid Build Coastguard Worker palignr xm4, xm3, xm2, 4 2376*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm2, xm4 ; 01 12 2377*c0909341SAndroid Build Coastguard 
Worker punpckhwd xm2, xm4 ; 23 34 2378*c0909341SAndroid Build Coastguard Worker pshufd xm0, xm3, q2121 2379*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0 ; 45 56 2380*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 2381*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+ssq*1] 2382*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2383*c0909341SAndroid Build Coastguard Worker movhps xm4, [srcq+ssq*0] 2384*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm6 2385*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm7 2386*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm1, xm10 ; a0 b0 2387*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 2388*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm11 ; a1 b1 2389*c0909341SAndroid Build Coastguard Worker paddd xm5, xm2 2390*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 2391*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm12 ; a2 b2 2392*c0909341SAndroid Build Coastguard Worker phaddw xm4, xm4 2393*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm8 2394*c0909341SAndroid Build Coastguard Worker paddd xm5, xm3 2395*c0909341SAndroid Build Coastguard Worker palignr xm3, xm4, xm0, 12 2396*c0909341SAndroid Build Coastguard Worker mova xm0, xm4 2397*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0 ; 67 78 2398*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm3, xm13 ; a3 b3 2399*c0909341SAndroid Build Coastguard Worker paddd xm5, xm9 2400*c0909341SAndroid Build Coastguard Worker paddd xm5, xm4 2401*c0909341SAndroid Build Coastguard Worker psrad xm5, 10 2402*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 2403*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm5 2404*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm5, 0 2405*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm5, 1 2406*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2407*c0909341SAndroid Build 
Coastguard Worker sub hd, 2 2408*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 2409*c0909341SAndroid Build Coastguard Worker RET 2410*c0909341SAndroid Build Coastguard Worker.hv_w4: 2411*c0909341SAndroid Build Coastguard Worker mova m6, [subpel_h_shuf4] 2412*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 2413*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 2414*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+ssq*2] 2415*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2416*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+ssq*0] 2417*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 2418*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0xcc ; 0 1 2419*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*2] 2420*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2421*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+ssq*0] 2422*c0909341SAndroid Build Coastguard Worker vpblendd m0, m5, 0xcc ; 2 3 2423*c0909341SAndroid Build Coastguard Worker vpblendd m3, m4, 0xcc ; 4 5 2424*c0909341SAndroid Build Coastguard Worker pshufb m2, m6 2425*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 2426*c0909341SAndroid Build Coastguard Worker pshufb m3, m6 2427*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 2428*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7 2429*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7 2430*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m7 2431*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m7 2432*c0909341SAndroid Build Coastguard Worker phaddw m2, m0 2433*c0909341SAndroid Build Coastguard Worker phaddw m3, m1 2434*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m8 2435*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m8 2436*c0909341SAndroid Build Coastguard Worker palignr m4, m3, m2, 4 2437*c0909341SAndroid Build Coastguard Worker punpcklwd 
m1, m2, m4 ; 01 12 2438*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m4 ; 23 34 2439*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q2121 2440*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m0 ; 45 56 2441*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 2442*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+ssq*1] 2443*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2444*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m1, m10 ; a0 b0 2445*c0909341SAndroid Build Coastguard Worker mova m1, m2 2446*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m11 ; a1 b1 2447*c0909341SAndroid Build Coastguard Worker paddd m5, m2 2448*c0909341SAndroid Build Coastguard Worker mova m2, m3 2449*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12 ; a2 b2 2450*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2451*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*0] 2452*c0909341SAndroid Build Coastguard Worker vpblendd m4, m3, 0xcc ; 7 8 2453*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 2454*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 2455*c0909341SAndroid Build Coastguard Worker phaddw m4, m4 2456*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m8 2457*c0909341SAndroid Build Coastguard Worker palignr m3, m4, m0, 12 2458*c0909341SAndroid Build Coastguard Worker mova m0, m4 2459*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m0 ; 67 78 2460*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m3, m13 ; a3 b3 2461*c0909341SAndroid Build Coastguard Worker paddd m5, m9 2462*c0909341SAndroid Build Coastguard Worker paddd m5, m4 2463*c0909341SAndroid Build Coastguard Worker psrad m5, 10 2464*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m5, 1 2465*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm4 2466*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm5 2467*c0909341SAndroid Build Coastguard Worker pshuflw xm5, xm5, q3120 
2468*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm5 2469*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm5, 1 2470*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2471*c0909341SAndroid Build Coastguard Worker sub hd, 2 2472*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 2473*c0909341SAndroid Build Coastguard Worker RET 2474*c0909341SAndroid Build Coastguard Worker.hv_w8: 2475*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 16 2476*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2477*c0909341SAndroid Build Coastguard Worker sub srcq, 3 2478*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] 2479*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] 2480*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2481*c0909341SAndroid Build Coastguard Worker shr myd, 16 2482*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2483*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2484*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] 2485*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2486*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 2487*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 2488*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 2489*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q0000 2490*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q1111 2491*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q2222 2492*c0909341SAndroid Build Coastguard Worker pshufd m15, m0, q3333 2493*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*8-64] 2494*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*4] 2495*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 2496*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufA] 2497*c0909341SAndroid Build Coastguard Worker 
movu xm4, [srcq+ssq*0] 2498*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ss3q] 2499*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufB] 2500*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*1] 2501*c0909341SAndroid Build Coastguard Worker mov r7, dstq 2502*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufC] 2503*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ssq*2] 2504*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [r4+ssq*0] 2505*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0xf0 ; 0 3 2506*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r4+ssq*1], 1 ; 1 4 2507*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r4+ssq*2], 1 ; 2 5 2508*c0909341SAndroid Build Coastguard Worker add r4, ss3q 2509*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r4+ssq*0], 1 ; 3 6 2510*c0909341SAndroid Build Coastguard Worker%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] 2511*c0909341SAndroid Build Coastguard Worker pshufb %3, %1, %6 2512*c0909341SAndroid Build Coastguard Worker pshufb %4, %1, %7 2513*c0909341SAndroid Build Coastguard Worker pshufb %1, %5 2514*c0909341SAndroid Build Coastguard Worker pmaddubsw %2, %3, m10 2515*c0909341SAndroid Build Coastguard Worker pmaddubsw %4, m11 2516*c0909341SAndroid Build Coastguard Worker pmaddubsw %3, m11 2517*c0909341SAndroid Build Coastguard Worker pmaddubsw %1, m10 2518*c0909341SAndroid Build Coastguard Worker paddw %2, %4 2519*c0909341SAndroid Build Coastguard Worker paddw %1, %3 2520*c0909341SAndroid Build Coastguard Worker phaddw %1, %2 2521*c0909341SAndroid Build Coastguard Worker%endmacro 2522*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 2523*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 2524*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 2525*c0909341SAndroid Build Coastguard Worker 
HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 2526*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 2527*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q3120 2528*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 2529*c0909341SAndroid Build Coastguard Worker vpermq m6, m6, q3120 2530*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 2531*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 2532*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 2533*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m7 2534*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 2535*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 ; 01 2536*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 2537*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5, m6 ; 12 2538*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 2539*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m6, m7 ; 23 2540*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 56 2541*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 2542*c0909341SAndroid Build Coastguard Worker vextracti128 r6m, m0, 1 ; not enough registers 2543*c0909341SAndroid Build Coastguard Worker movu xm0, [r4+ssq*1] 2544*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 2545*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r4+ssq*0], 1 ; 7 8 2546*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m1, m12 ; a0 2547*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m2, m12 ; b0 2548*c0909341SAndroid Build Coastguard Worker mova m1, m3 2549*c0909341SAndroid Build Coastguard Worker mova m2, m4 2550*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 ; a1 2551*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 ; b1 2552*c0909341SAndroid Build Coastguard Worker paddd m8, m3 2553*c0909341SAndroid Build Coastguard Worker paddd m9, m4 2554*c0909341SAndroid Build Coastguard Worker mova m3, m5 
2555*c0909341SAndroid Build Coastguard Worker mova m4, m6 2556*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m14 ; a2 2557*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m14 ; b2 2558*c0909341SAndroid Build Coastguard Worker paddd m8, m5 2559*c0909341SAndroid Build Coastguard Worker paddd m9, m6 2560*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufB] 2561*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufC] 2562*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [subpel_h_shufA] 2563*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 2564*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_8192] 2565*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pd_512] 2566*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, r6m 2567*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 2568*c0909341SAndroid Build Coastguard Worker paddd m8, m7 2569*c0909341SAndroid Build Coastguard Worker paddd m9, m7 2570*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 ; 7 8 2571*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x04 ; 6 7 2572*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m7 ; 67 2573*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 78 2574*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m5, m15 ; a3 2575*c0909341SAndroid Build Coastguard Worker paddd m8, m7 2576*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m6, m15 ; b3 2577*c0909341SAndroid Build Coastguard Worker paddd m7, m9 2578*c0909341SAndroid Build Coastguard Worker psrad m8, 10 2579*c0909341SAndroid Build Coastguard Worker psrad m7, 10 2580*c0909341SAndroid Build Coastguard Worker packssdw m8, m7 2581*c0909341SAndroid Build Coastguard Worker vextracti128 xm7, m8, 1 2582*c0909341SAndroid Build Coastguard Worker packuswb xm8, xm7 2583*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm8, q3120 2584*c0909341SAndroid Build 
; ---- tail of the put-side .hv_w8 loop (that definition begins before this block) ----
    movq            [r7+dsq*0], xm7
    movhps          [r7+dsq*1], xm7
    lea             r7, [r7+dsq*2]
    sub             hd, 2
    jg              .hv_w8_loop
    add             srcq, 8
    add             dstq, 8
    movzx           hd, r6b
    sub             r6d, 1<<8
    jg              .hv_w8_loop0
    RET

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR

;-----------------------------------------------------------------------
; prep_6tap_8bpc(tmp, src, ss, w, h, mx, my)
;
; Subpel "prep" filter for the filter combinations listed above. Uses
; 6 taps of each 8-byte subpel_filters entry (4 taps on the w4 / h4 paths)
; and stores the result to tmp as packed 16-bit words.
;
; t0d/t1d are added to the replicated mx/my below; presumably the
; PREP_8TAP_FN entry stubs load them with the filter-type constants
; (FN is defined outside this block -- confirm there).
;
; mxd/myd layout after the imul/add (per the inline comments):
;   bits 16+  : index used for the 6-tap kernel  (consumed via shr reg, 16)
;   bits 8-11 : raw mx/my fraction               (tested via test reg, 0xf00)
;   low byte  : index used for the 4-tap kernel  (consumed via movzx reg, regb)
;
; Dispatch:
;   mx != 0, my == 0 -> .h        mx == 0, my != 0 -> .v
;   mx != 0, my != 0 -> .hv       mx == 0, my == 0 -> .prep (jump table)
;
; r7 holds the address of prep%+SUFFIX and is the base for every table or
; constant displacement until a path reuses r7 for something else.
; Rounding: pmulhrsw by pw_8192 is a rounding >>2 if the constant is 8192
; per word as its name suggests; the hv paths add pd_32 and psrad by 6.
;-----------------------------------------------------------------------
cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns
    imul            mxd, mxm, 0x010101
    add             mxd, t0d ; 6tap_h, mx, 4tap_h
    imul            myd, mym, 0x010101
    add             myd, t1d ; 6tap_v, my, 4tap_v
    lea             r7, [prep%+SUFFIX]
    mov             wd, wm
    movifnidn       hd, hm
    test            mxd, 0xf00
    jnz             .h
    test            myd, 0xf00
    jnz             .v
; No fractional offset in either direction: jump through the prep table
; indexed by log2(w). r6 = 3*stride is set up for the jump target.
.prep:
    tzcnt           wd, wd
    movzx           wd, word [r7+wq*2+table_offset(prep,)]
    add             wq, r7
    lea             r6, [ssq*3]
%if WIN64
    pop             r7 ; NOTE(review): presumably undoes the prologue's push of r7 -- confirm against x86inc
%endif
    jmp             wq

; ---- vertical-only filtering --------------------------------------------
; m6/m7/m8 = coefficient pairs (taps 1-2, 3-4, 5-6 of the 8-byte entry),
; m9 = pw_8192, nsq = -ssq so rows above src can be addressed.
.v:
    WIN64_SPILL_XMM 10, 12
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd ; h == 4: use the low-byte (4-tap) index instead
    lea             myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
    vpbroadcastd    m9, [pw_8192]
    vpbroadcastw    m6, [myq+0]
    mov             nsq, ssq
    vpbroadcastw    m7, [myq+2]
    neg             nsq
    vpbroadcastw    m8, [myq+4]
    cmp             wd, 8
    jg              .v_w16
    je              .v_w8
; w == 4: four output rows (32 bytes of tmp) per loop iteration.
.v_w4:
    movd            xm2, [srcq+nsq*2]
    pinsrd          xm2, [srcq+nsq*1], 1
    vpbroadcastd    m1, [srcq+ssq*0]
    vpbroadcastd    m3, [srcq+ssq*1]
    vpbroadcastd    m0, [srcq+ssq*2]
    vbroadcasti128  m5, [deint_shuf4]
    vpblendd        m1, m2, 0xeb
    punpcklqdq      m3, m0
    vpblendd        m1, m3, 0x60 ; 0 1 2 _   2 3 4 _
    pshufb          m1, m5       ; 01  12    23  34
.v_w4_loop:
    lea             srcq, [srcq+ssq*4]
    pinsrd          xm0, [srcq+nsq*1], 1
    vpbroadcastd    m2, [srcq+ssq*0]
    vpbroadcastd    m3, [srcq+ssq*1]
    vpblendd        m2, m0, 0xeb
    vpbroadcastd    m0, [srcq+ssq*2]
    punpcklqdq      m3, m0
    vpblendd        m2, m3, 0x60 ; 4 5 6 _   6 7 8 _
    pshufb          m2, m5       ; 45  56    67  78
    pmaddubsw       m3, m1, m6   ; a0 b0 c0 d0
    vperm2i128      m1, m2, 0x21 ; 23  34    45  56
    pmaddubsw       m4, m2, m8   ; a2 b2 c2 d2
    pmaddubsw       m1, m7       ; a1 b1 c1 d1
    paddw           m3, m4
    paddw           m3, m1
    pmulhrsw        m3, m9
    mova            m1, m2       ; carry the newest row pairs into the next iteration
    mova            [tmpq], m3
    add             tmpq, 32
    sub             hd, 4
    jg              .v_w4_loop
    RET
; w == 8: four output rows (2 x 32 bytes of tmp) per loop iteration.
.v_w8:
    movq            xm1, [srcq+nsq*2]
    vpbroadcastq    m3, [srcq+nsq*1]
    vpbroadcastq    m2, [srcq+ssq*0]
    vpbroadcastq    m4, [srcq+ssq*1]
    vpbroadcastq    m0, [srcq+ssq*2]
    vpblendd        m1, m3, 0x30
    vpblendd        m3, m2, 0x30
    punpcklbw       m1, m3     ; 01 12
    vpblendd        m2, m4, 0x30
    vpblendd        m4, m0, 0x30
    punpcklbw       m2, m4     ; 23 34
.v_w8_loop:
    lea             srcq, [srcq+ssq*4]
    pmaddubsw       m1, m6     ; a0
    vpbroadcastq    m3, [srcq+nsq*1]
    pmaddubsw       m4, m2, m7 ; a1
    pmaddubsw       m5, m2, m6 ; b0
    vpbroadcastq    m2, [srcq+ssq*0]
    vpblendd        m0, m3, 0x30
    vpblendd        m3, m2, 0x30
    paddw           m4, m1
    punpcklbw       m1, m0, m3 ; 45 56
    vpbroadcastq    m3, [srcq+ssq*1]
    vpbroadcastq    m0, [srcq+ssq*2]
    vpblendd        m2, m3, 0x30
    vpblendd        m3, m0, 0x30
    punpcklbw       m2, m3     ; 67 78
    pmaddubsw       m3, m1, m7 ; b1
    paddw           m5, m3
    pmaddubsw       m3, m1, m8 ; a2
    paddw           m4, m3
    pmaddubsw       m3, m2, m8 ; b2
    paddw           m5, m3
    pmulhrsw        m4, m9
    pmulhrsw        m5, m9
    mova            [tmpq+32*0], m4
    mova            [tmpq+32*1], m5
    add             tmpq, 32*2
    sub             hd, 4
    jg              .v_w8_loop
    RET
; w >= 16: process the block in 16-pixel-wide columns, two rows per inner
; iteration. r6d = h + ((w/16 - 1) << 8): the low byte keeps h (restored
; with movzx hd, r6b), the upper bits count the columns still to do.
; Output rows in tmp are w*2 bytes apart, hence the [r7+wq*n] stores.
.v_w16:
    lea             r6d, [wq*2-32]
    lea             srcq, [srcq+nsq*2] ; start two rows above src
    WIN64_PUSH_XMM  12
    lea             r6d, [hq+r6*8]
.v_w16_loop0:
    vbroadcasti128  m3, [srcq+ssq*0]
    lea             r5, [srcq+ssq*2]
    vbroadcasti128  m4, [srcq+ssq*1]
    mov             r7, tmpq           ; r7 = output cursor for this column
    vbroadcasti128  m0, [r5+ssq*0]
    vbroadcasti128  m1, [r5+ssq*1]
    lea             r5, [r5+ssq*2]
    vbroadcasti128  m2, [r5+ssq*0]
    shufpd          m3, m0, 0x0c
    shufpd          m4, m1, 0x0c
    punpcklbw       m1, m3, m4 ; 01
    punpckhbw       m3, m4     ; 23
    shufpd          m0, m2, 0x0c
    punpcklbw       m2, m4, m0 ; 12
    punpckhbw       m4, m0     ; 34
.v_w16_loop:
    vbroadcasti128  m5, [r5+ssq*1]
    pmaddubsw       m10, m1, m6 ; a0
    lea             r5, [r5+ssq*2]
    pmaddubsw       m11, m2, m6 ; b0
    mova            m1, m3
    pmaddubsw       m3, m7     ; a1
    mova            m2, m4
    pmaddubsw       m4, m7     ; b1
    paddw           m10, m3
    vbroadcasti128  m3, [r5+ssq*0]
    paddw           m11, m4
    shufpd          m4, m0, m5, 0x0d
    shufpd          m0, m5, m3, 0x0c
    punpcklbw       m3, m4, m0 ; 45
    punpckhbw       m4, m0     ; 56
    pmaddubsw       m5, m3, m8 ; a2
    paddw           m10, m5
    pmaddubsw       m5, m4, m8 ; b2
    paddw           m11, m5
    pmulhrsw        m10, m9
    pmulhrsw        m11, m9
    mova            [r7+wq*0], m10
    mova            [r7+wq*2], m11
    lea             r7, [r7+wq*4]
    sub             hd, 2
    jg              .v_w16_loop
    add             srcq, 16           ; next 16-pixel column
    add             tmpq, 32
    movzx           hd, r6b            ; reload h from the packed counter
    sub             r6d, 1<<8          ; one column done
    jg              .v_w16_loop0
    RET

; ---- horizontal-only filtering ------------------------------------------
; w == 4: 4-tap kernel (bytes +2..+5 of the filter entry), four rows per
; iteration. m4 = pw_8192 was loaded in .h before branching here.
.h_w4:
    RESET_STACK_STATE                  ; x86inc stack-state bookkeeping (see x86inc.asm)
    movzx           mxd, mxb
    vbroadcasti128  m3, [subpel_h_shufA]
    dec             srcq               ; start one pixel to the left
    vpbroadcastd    m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    lea             r3, [ssq*3]
.h_w4_loop:
    movq            xm0, [srcq+ssq*0]
    vpbroadcastq    m2, [srcq+ssq*2]
    movq            xm1, [srcq+ssq*1]
    vpblendd        m0, m2, 0x30
    vpbroadcastq    m2, [srcq+r3   ]
    lea             srcq, [srcq+ssq*4]
    vpblendd        m1, m2, 0x30
    pshufb          m0, m3
    pshufb          m1, m3
    pmaddubsw       m0, m5
    pmaddubsw       m1, m5
    phaddw          m0, m1
    pmulhrsw        m0, m4
    mova            [tmpq], m0
    add             tmpq, 32
    sub             hd, 4
    jg              .h_w4_loop
    RET
; Entry for mx != 0. Falls through to the 6-tap setup for w > 4:
; m3/m5/m6 = byte shuffles, m7/m8/m9 = coefficient pairs, m4 = pw_8192,
; then jumps through the prep _6tap_h table indexed by log2(w).
.h:
    test            myd, 0xf00
    jnz             .hv
    vpbroadcastd    m4, [pw_8192]
    cmp             wd, 4
    je              .h_w4
    WIN64_SPILL_XMM 10
    tzcnt           wd, wd
    vbroadcasti128  m3, [z_filter_s+ 2]
    shr             mxd, 16
    vbroadcasti128  m5, [z_filter_s+ 6]
    sub             srcq, 2            ; start two pixels to the left
    vbroadcasti128  m6, [z_filter_s+10]
    lea             mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX]
    movzx           wd, word [r7+wq*2+table_offset(prep, _6tap_h)]
    vpbroadcastw    m7, [mxq+0]
    vpbroadcastw    m8, [mxq+2]
    add             wq, r7
    vpbroadcastw    m9, [mxq+4]
    jmp             wq
; w == 8: two rows per iteration (one row per 128-bit lane).
.h_w8:
    movu            xm0, [srcq+ssq*0]
    vinserti128     m0, [srcq+ssq*1], 1
    lea             srcq, [srcq+ssq*2]
; PREP_6TAP_H: horizontal 6-tap on the pixels in m0.
;   in : m0 = source bytes, m3/m5/m6 = shuffles, m7/m8/m9 = coefficient
;        pairs, m4 = rounding multiplier
;   out: m0 = filtered, rounded 16-bit words     clobbers: m1, m2
%macro PREP_6TAP_H 0
    pshufb          m1, m0, m3
    pmaddubsw       m1, m7
    pshufb          m2, m0, m5
    pmaddubsw       m2, m8
    pshufb          m0, m6
    pmaddubsw       m0, m9
    paddw           m1, m2
    paddw           m0, m1
    pmulhrsw        m0, m4
%endmacro
    PREP_6TAP_H
    mova            [tmpq], m0
    add             tmpq, 32
    sub             hd, 2
    jg              .h_w8
    RET
; w == 16: two rows per iteration, each row split into two 8-pixel halves.
.h_w16:
    movu            xm0, [srcq+ssq*0+8*0]
    vinserti128     m0, [srcq+ssq*0+8*1], 1
    PREP_6TAP_H
    mova            [tmpq+32*0], m0
    movu            xm0, [srcq+ssq*1+8*0]
    vinserti128     m0, [srcq+ssq*1+8*1], 1
    lea             srcq, [srcq+ssq*2]
    PREP_6TAP_H
    mova            [tmpq+32*1], m0
    add             tmpq, 32*2
    sub             hd, 2
    jg              .h_w16
    RET
; w == 32/64/128: one row at a time, 32 pixels per inner iteration.
; r6 starts at -(w-32) and counts up to 0 (so 1, 2 or 4 iterations per row);
; srcq is biased by the same amount so [srcq+r6] begins at the row start.
; r5 keeps the initial r6 for reloading at each new row.
.h_w32:
    xor             r6d, r6d
    jmp             .h_start
.h_w64:
    mov             r6, -32*1
    jmp             .h_start
.h_w128:
    mov             r6, -32*3
.h_start:
    sub             srcq, r6
    mov             r5, r6
.h_loop:
    movu            xm0, [srcq+r6+8*0]
    vinserti128     m0, [srcq+r6+8*1], 1
    PREP_6TAP_H
    mova            [tmpq+32*0], m0
    movu            xm0, [srcq+r6+8*2]
    vinserti128     m0, [srcq+r6+8*3], 1
    PREP_6TAP_H
    mova            [tmpq+32*1], m0
    add             tmpq, 32*2
    add             r6, 32
    jle             .h_loop
    add             srcq, ssq
    mov             r6, r5
    dec             hd
    jg              .h_loop
    RET

; ---- combined horizontal + vertical filtering ---------------------------
.hv:
    WIN64_SPILL_XMM 14, 16
    cmp             wd, 4
    jne             .hv_w8
; w == 4: 4-tap horizontal (m7), 6-tap vertical with the coefficients
; sign-extended to words in m10/m11/m12; four output rows per iteration.
.hv_w4:
    movzx           mxd, mxb
    dec             srcq
    vpbroadcastd    m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd ; h == 4: use the low-byte (4-tap) index instead
    mova            m6, [subpel_h_shuf4]
    vpbroadcastq    m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
    mov             nsq, ssq
    pmovzxbd        m13, [deint_shuf4]
    neg             nsq
    vpbroadcastd    m8, [pw_8192]
    vpbroadcastd    m9, [pd_32]
    punpcklbw       m0, m0
    vpbroadcastq    m2, [srcq+nsq*2]
    psraw           m0, 8 ; sign-extend
    vpbroadcastq    m4, [srcq+nsq*1]
    pshufd          m10, m0, q0000
    vpbroadcastq    m1, [srcq+ssq*0]
    pshufd          m11, m0, q1111
    vpbroadcastq    m3, [srcq+ssq*1]
    pshufd          m12, m0, q2222
    vpbroadcastq    m0, [srcq+ssq*2]
    vpblendd        m2, m4, 0xcc ; 0 1
    vpblendd        m1, m3, 0xcc ; 2 3
    pshufb          m2, m6
    pshufb          m1, m6
    pshufb          m0, m6
    pmaddubsw       m2, m7
    pmaddubsw       m1, m7
    pmaddubsw       m0, m7
    phaddw          m2, m1 ; 0 1 2 3
    phaddw          m0, m0 ; 4
    pmulhrsw        m2, m8
    pmulhrsw        m0, m8
    palignr         m0, m2, 4
    punpcklwd       m1, m2, m0  ; 01 12
    punpckhwd       m2, m0      ; 23 34
.hv_w4_loop:
    pmaddwd         m4, m10, m1 ; a0 b0
    lea             srcq, [srcq+ssq*4]
    pmaddwd         m5, m2, m10 ; c0 d0
    vpbroadcastq    m1, [srcq+nsq*1]
    pmaddwd         m2, m11     ; a1 b1
    vpbroadcastq    m3, [srcq+ssq*0]
    paddd           m4, m2
    vpbroadcastq    m2, [srcq+ssq*1]
    vpblendd        m1, m3, 0xcc ; 5 6
    vpbroadcastq    m3, [srcq+ssq*2]
    vpblendd        m2, m3, 0xcc ; 7 8
    pshufb          m1, m6
    pshufb          m2, m6
    pmaddubsw       m1, m7
    pmaddubsw       m2, m7
    phaddw          m1, m2      ; 5 6 7 8
    pmulhrsw        m1, m8
    paddd           m5, m9
    paddd           m4, m9
    palignr         m2, m1, m0, 12
    mova            m0, m1
    punpcklwd       m1, m2, m0  ; 45 56
    punpckhwd       m2, m0      ; 67 78
    pmaddwd         m3, m11, m1 ; c1 d1
    paddd           m5, m3
    pmaddwd         m3, m12, m1 ; a2 b2
    paddd           m4, m3
    pmaddwd         m3, m12, m2 ; c2 d2
    paddd           m5, m3
    psrad           m4, 6
    psrad           m5, 6
    packssdw        m4, m5
    vpermd          m4, m13, m4
    mova            [tmpq], m4
    add             tmpq, 32
    sub             hd, 4
    jg              .hv_w4_loop
    RET
; w >= 8: 6-tap in both directions, processed in 8-pixel-wide columns,
; two rows per inner iteration.
;   m10/m11/m12 = horizontal coefficient pairs (used by HV_H_6TAP_W8,
;                 which is defined outside this block)
;   m13/m14/m15 = vertical coefficient pairs, sign-extended to words
;   r6d = h + ((w/8 - 1) << 8): low byte keeps h, upper bits count the
;         8-pixel columns still to do.
; The table displacements use prep%+SUFFIX so they match the base that was
; loaded into r7 at function entry.
.hv_w8:
    shr             mxd, 16
    lea             mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX]
    WIN64_PUSH_XMM  16
    vpbroadcastw    m10, [mxq+0]
    vpbroadcastw    m11, [mxq+2]
    vpbroadcastw    m12, [mxq+4]
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 6
    cmovs           myd, mxd ; h < 6: use the low-byte (4-tap) index instead
    vpbroadcastq    m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
    lea             r7, [ssq*2+2]      ; r7 no longer the table base from here on
    vbroadcasti128  m8, [z_filter_s+ 6]
    punpcklbw       m0, m0
    vbroadcasti128  m9, [z_filter_s+10]
    psraw           m0, 8 ; sign-extend
    lea             r6d, [wq*8-64]
    pshufd          m13, m0, q0000
    sub             srcq, r7           ; start two rows up and two pixels left
    pshufd          m14, m0, q1111
    lea             r6d, [hq+r6*4]
    pshufd          m15, m0, q2222
.hv_w8_loop0:
    vbroadcasti128  m7, [z_filter_s+2]
    movu            xm3, [srcq+ssq*0]
    lea             r5, [srcq+ssq*2]
    movu            xm4, [srcq+ssq*1]
    vbroadcasti128  m0, [r5+ssq*0]
    mov             r7, tmpq           ; r7 = output cursor for this column
    vinserti128     m4, [r5+ssq*1], 1 ; 1 3
    lea             r5, [r5+ssq*2]
    vpblendd        m3, m0, 0xf0      ; 0 2
    vinserti128     m0, [r5+ssq*0], 1 ; 2 4
    vpbroadcastd    m5, [pw_8192]
    HV_H_6TAP_W8    m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8    m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8    m0, m1, m2, m7, m8, m9
    vpermq          m3, m3, q3120
    vpermq          m4, m4, q3120
    vpermq          m0, m0, q3120
    pmulhrsw        m3, m5
    pmulhrsw        m4, m5
    pmulhrsw        m0, m5
    punpcklwd       m1, m3, m4  ; 01
    punpckhwd       m3, m4      ; 23
    punpcklwd       m2, m4, m0  ; 12
    punpckhwd       m4, m0      ; 34
.hv_w8_loop:
    movu            xm7, [r5+ssq*1]
    lea             r5, [r5+ssq*2]
    vinserti128     m7, [r5+ssq*0], 1 ; 5 6
    pmaddwd         m5, m13, m1 ; a0
    mova            m1, m3
    pmaddwd         m6, m13, m2 ; b0
    mova            m2, m4
    pmaddwd         m3, m14     ; a1
    pmaddwd         m4, m14     ; b1
    paddd           m5, m3
    vbroadcasti128  m3, [z_filter_s+2] ; m3/m4 are free here, reuse as shuffle/tmp
    paddd           m6, m4
    HV_H_6TAP_W8    m7, m3, m4, m3, m8, m9
    vpbroadcastd    m3, [pw_8192]
    vpbroadcastd    m4, [pd_32]
    pmulhrsw        m7, m3
    paddd           m5, m4
    paddd           m6, m4
    mova            m4, m0
    vpermq          m0, m7, q3120
    shufpd          m4, m0, 0x05       ; previous newest row paired with row 5
    punpcklwd       m3, m4, m0  ; 45
    pmaddwd         m7, m15, m3 ; a2
    punpckhwd       m4, m0      ; 56
    paddd           m5, m7
    pmaddwd         m7, m15, m4 ; b2
    paddd           m6, m7
    psrad           m5, 6
    psrad           m6, 6
    packssdw        m5, m6
    vpermq          m5, m5, q3120
    mova            [r7+wq*0], xm5
    vextracti128    [r7+wq*2], m5, 1
    lea             r7, [r7+wq*4]
    sub             hd, 2
    jg              .hv_w8_loop
    add             srcq, 8            ; next 8-pixel column
    add             tmpq, 16
    movzx           hd, r6b            ; reload h from the packed counter
    sub             r6d, 1<<8          ; one column done
    jg              .hv_w8_loop0
    RET

PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP

cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
Coastguard Worker imul mxd, mxm, 0x010101 3067*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 3068*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 3069*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 3070*c0909341SAndroid Build Coastguard Worker lea r7, [prep%+SUFFIX] 3071*c0909341SAndroid Build Coastguard Worker mov wd, wm 3072*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3073*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 3074*c0909341SAndroid Build Coastguard Worker jnz .h 3075*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3076*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep 3077*c0909341SAndroid Build Coastguard Worker.v: 3078*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 12, 15 3079*c0909341SAndroid Build Coastguard Worker movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 3080*c0909341SAndroid Build Coastguard Worker shr myd, 16 ; Note that the code is 8-tap only, having 3081*c0909341SAndroid Build Coastguard Worker cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 3082*c0909341SAndroid Build Coastguard Worker cmove myd, mxd ; had a negligible effect on performance. 
3083*c0909341SAndroid Build Coastguard Worker lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3084*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 3085*c0909341SAndroid Build Coastguard Worker sub srcq, stride3q 3086*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 3087*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, [myq+0] 3088*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [myq+2] 3089*c0909341SAndroid Build Coastguard Worker vpbroadcastw m10, [myq+4] 3090*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, [myq+6] 3091*c0909341SAndroid Build Coastguard Worker cmp wd, 8 3092*c0909341SAndroid Build Coastguard Worker jg .v_w16 3093*c0909341SAndroid Build Coastguard Worker je .v_w8 3094*c0909341SAndroid Build Coastguard Worker.v_w4: 3095*c0909341SAndroid Build Coastguard Worker movd xm0, [srcq+strideq*0] 3096*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [srcq+strideq*2] 3097*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [srcq+strideq*1] 3098*c0909341SAndroid Build Coastguard Worker add srcq, stride3q 3099*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [srcq+strideq*0] 3100*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ 3101*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ 3102*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [srcq+strideq*1] 3103*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [srcq+strideq*2] 3104*c0909341SAndroid Build Coastguard Worker vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ 3105*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [srcq+stride3q ] 3106*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [deint_shuf4] 3107*c0909341SAndroid Build Coastguard Worker vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 3108*c0909341SAndroid Build Coastguard Worker vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 3109*c0909341SAndroid Build Coastguard Worker vpblendd m3, 
m1, 0xaa ; 1 2 3 4 3 4 5 _ 3110*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m2, m3 ; 01 12 23 34 3111*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 3112*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 ; 23 34 45 56 3113*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 3114*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3115*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [srcq+strideq*0], 1 3116*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [srcq+strideq*1] 3117*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [srcq+strideq*2] 3118*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ 3119*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [srcq+stride3q ] 3120*c0909341SAndroid Build Coastguard Worker vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ 3121*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ 3122*c0909341SAndroid Build Coastguard Worker pshufb m3, m5 ; 67 78 89 9a 3123*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m1, m8 3124*c0909341SAndroid Build Coastguard Worker vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 3125*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m9 3126*c0909341SAndroid Build Coastguard Worker paddw m4, m2 3127*c0909341SAndroid Build Coastguard Worker mova m2, m3 3128*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m11 3129*c0909341SAndroid Build Coastguard Worker paddw m3, m4 3130*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m1, m10 3131*c0909341SAndroid Build Coastguard Worker paddw m3, m4 3132*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m7 3133*c0909341SAndroid Build Coastguard Worker mova [tmpq], m3 3134*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3135*c0909341SAndroid Build Coastguard Worker sub hd, 4 3136*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 3137*c0909341SAndroid Build Coastguard Worker RET 
3138*c0909341SAndroid Build Coastguard Worker.v_w8: 3139*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*0] 3140*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*1] 3141*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+strideq*2] 3142*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+stride3q ] 3143*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3144*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+strideq*0] 3145*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [srcq+strideq*1] 3146*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*2] 3147*c0909341SAndroid Build Coastguard Worker vpblendd m1, m4, 0x30 3148*c0909341SAndroid Build Coastguard Worker vpblendd m4, m2, 0x30 3149*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m4 ; 01 12 3150*c0909341SAndroid Build Coastguard Worker vpblendd m2, m5, 0x30 3151*c0909341SAndroid Build Coastguard Worker vpblendd m5, m3, 0x30 3152*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m5 ; 23 34 3153*c0909341SAndroid Build Coastguard Worker vpblendd m3, m6, 0x30 3154*c0909341SAndroid Build Coastguard Worker vpblendd m6, m0, 0x30 3155*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m6 ; 45 56 3156*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 3157*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+stride3q ] 3158*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3159*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m2, m9 ; a1 3160*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m2, m8 ; b0 3161*c0909341SAndroid Build Coastguard Worker vpblendd m2, m0, m4, 0x30 3162*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*0] 3163*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 3164*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 ; 67 78 3165*c0909341SAndroid Build Coastguard Worker pmaddubsw 
m1, m8 ; a0 3166*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m9 ; b1 3167*c0909341SAndroid Build Coastguard Worker paddw m5, m1 3168*c0909341SAndroid Build Coastguard Worker mova m1, m3 3169*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m10 ; a2 3170*c0909341SAndroid Build Coastguard Worker paddw m6, m4 3171*c0909341SAndroid Build Coastguard Worker paddw m5, m3 3172*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*1] 3173*c0909341SAndroid Build Coastguard Worker vpblendd m3, m0, m4, 0x30 3174*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*2] 3175*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x30 3176*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4 ; 89 9a 3177*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m11 ; a3 3178*c0909341SAndroid Build Coastguard Worker paddw m5, m4 3179*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m10 ; b2 3180*c0909341SAndroid Build Coastguard Worker paddw m6, m4 3181*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m11 ; b3 3182*c0909341SAndroid Build Coastguard Worker paddw m6, m4 3183*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 3184*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m7 3185*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m5 3186*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m6 3187*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 3188*c0909341SAndroid Build Coastguard Worker sub hd, 4 3189*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 3190*c0909341SAndroid Build Coastguard Worker RET 3191*c0909341SAndroid Build Coastguard Worker.v_w16: 3192*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*2-32] 3193*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 15 3194*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*8] 3195*c0909341SAndroid Build Coastguard Worker.v_w16_loop0: 3196*c0909341SAndroid Build Coastguard Worker 
vbroadcasti128 m4, [srcq+strideq*0] 3197*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [srcq+strideq*1] 3198*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+strideq*2] 3199*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [r5+strideq*1] 3200*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [r5+strideq*0] 3201*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3202*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m1, [r5+strideq*0] 3203*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m2, [r5+strideq*1] 3204*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3205*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [r5+strideq*0] 3206*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3207*c0909341SAndroid Build Coastguard Worker shufpd m4, m0, 0x0c 3208*c0909341SAndroid Build Coastguard Worker shufpd m5, m1, 0x0c 3209*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m4, m5 ; 01 3210*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m5 ; 34 3211*c0909341SAndroid Build Coastguard Worker shufpd m6, m2, 0x0c 3212*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m5, m6 ; 12 3213*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 ; 45 3214*c0909341SAndroid Build Coastguard Worker shufpd m0, m3, 0x0c 3215*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m6, m0 ; 23 3216*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 ; 56 3217*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 3218*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m12, [r5+strideq*1] 3219*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3220*c0909341SAndroid Build Coastguard Worker pmaddubsw m13, m1, m8 ; a0 3221*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m2, m8 ; b0 3222*c0909341SAndroid Build Coastguard Worker mova m1, m3 3223*c0909341SAndroid Build Coastguard Worker mova m2, m4 3224*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, 
m9 ; a1 3225*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m9 ; b1 3226*c0909341SAndroid Build Coastguard Worker paddw m13, m3 3227*c0909341SAndroid Build Coastguard Worker paddw m14, m4 3228*c0909341SAndroid Build Coastguard Worker mova m3, m5 3229*c0909341SAndroid Build Coastguard Worker mova m4, m6 3230*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m10 ; a2 3231*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m10 ; b2 3232*c0909341SAndroid Build Coastguard Worker paddw m13, m5 3233*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [r5+strideq*0] 3234*c0909341SAndroid Build Coastguard Worker paddw m14, m6 3235*c0909341SAndroid Build Coastguard Worker shufpd m6, m0, m12, 0x0d 3236*c0909341SAndroid Build Coastguard Worker shufpd m0, m12, m5, 0x0c 3237*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m6, m0 ; 67 3238*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 ; 78 3239*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m5, m11 ; a3 3240*c0909341SAndroid Build Coastguard Worker paddw m13, m12 3241*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m6, m11 ; b3 3242*c0909341SAndroid Build Coastguard Worker paddw m14, m12 3243*c0909341SAndroid Build Coastguard Worker pmulhrsw m13, m7 3244*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m7 3245*c0909341SAndroid Build Coastguard Worker mova [r7+wq*0], m13 3246*c0909341SAndroid Build Coastguard Worker mova [r7+wq*2], m14 3247*c0909341SAndroid Build Coastguard Worker lea r7, [r7+wq*4] 3248*c0909341SAndroid Build Coastguard Worker sub hd, 2 3249*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 3250*c0909341SAndroid Build Coastguard Worker add srcq, 16 3251*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3252*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 3253*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 3254*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop0 3255*c0909341SAndroid Build Coastguard Worker RET 
3256*c0909341SAndroid Build Coastguard Worker.h: 3257*c0909341SAndroid Build Coastguard Worker.h_w4: 3258*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3259*c0909341SAndroid Build Coastguard Worker jnz .hv 3260*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pw_8192] 3261*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3262*c0909341SAndroid Build Coastguard Worker je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 3263*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 10 3264*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [subpel_h_shufA] 3265*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 3266*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufB] 3267*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufC] 3268*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3269*c0909341SAndroid Build Coastguard Worker sub srcq, 3 3270*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] 3271*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 3272*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 3273*c0909341SAndroid Build Coastguard Worker add wq, r7 3274*c0909341SAndroid Build Coastguard Worker jmp wq 3275*c0909341SAndroid Build Coastguard Worker.h_w8: 3276*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 3277*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*1], 1 3278*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 3279*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 0 3280*c0909341SAndroid Build Coastguard Worker pshufb m1, m0, m5 3281*c0909341SAndroid Build Coastguard Worker pshufb m2, m0, m6 3282*c0909341SAndroid Build Coastguard Worker pshufb m3, m0, m7 3283*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m8 3284*c0909341SAndroid Build Coastguard Worker 
pmaddubsw m0, m2, m8 3285*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m9 3286*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m9 3287*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3288*c0909341SAndroid Build Coastguard Worker paddw m0, m3 3289*c0909341SAndroid Build Coastguard Worker phaddw m0, m1, m0 3290*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 3291*c0909341SAndroid Build Coastguard Worker%endmacro 3292*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 3293*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 3294*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3295*c0909341SAndroid Build Coastguard Worker sub hd, 2 3296*c0909341SAndroid Build Coastguard Worker jg .h_w8 3297*c0909341SAndroid Build Coastguard Worker RET 3298*c0909341SAndroid Build Coastguard Worker.h_w16: 3299*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+8*0] 3300*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*0+8*1], 1 3301*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 3302*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 3303*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*1+8*0] 3304*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+strideq*1+8*1], 1 3305*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 3306*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 3307*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m0 3308*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 3309*c0909341SAndroid Build Coastguard Worker sub hd, 2 3310*c0909341SAndroid Build Coastguard Worker jg .h_w16 3311*c0909341SAndroid Build Coastguard Worker RET 3312*c0909341SAndroid Build Coastguard Worker.h_w32: 3313*c0909341SAndroid Build Coastguard Worker xor r6d, r6d 3314*c0909341SAndroid Build Coastguard Worker jmp .h_start 3315*c0909341SAndroid Build Coastguard Worker.h_w64: 3316*c0909341SAndroid Build Coastguard Worker mov r6, -32*1 
3317*c0909341SAndroid Build Coastguard Worker jmp .h_start 3318*c0909341SAndroid Build Coastguard Worker.h_w128: 3319*c0909341SAndroid Build Coastguard Worker mov r6, -32*3 3320*c0909341SAndroid Build Coastguard Worker.h_start: 3321*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3322*c0909341SAndroid Build Coastguard Worker mov r5, r6 3323*c0909341SAndroid Build Coastguard Worker.h_loop: 3324*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+r6+8*0] 3325*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+r6+8*1], 1 3326*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 3327*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*0], m0 3328*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+r6+8*2] 3329*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+r6+8*3], 1 3330*c0909341SAndroid Build Coastguard Worker PREP_8TAP_H 3331*c0909341SAndroid Build Coastguard Worker mova [tmpq+32*1], m0 3332*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 3333*c0909341SAndroid Build Coastguard Worker add r6, 32 3334*c0909341SAndroid Build Coastguard Worker jle .h_loop 3335*c0909341SAndroid Build Coastguard Worker add srcq, strideq 3336*c0909341SAndroid Build Coastguard Worker mov r6, r5 3337*c0909341SAndroid Build Coastguard Worker dec hd 3338*c0909341SAndroid Build Coastguard Worker jg .h_loop 3339*c0909341SAndroid Build Coastguard Worker RET 3340*c0909341SAndroid Build Coastguard Worker.hv: 3341*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 16 3342*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3343*c0909341SAndroid Build Coastguard Worker je .hv_w4 3344*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3345*c0909341SAndroid Build Coastguard Worker sub srcq, 3 3346*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 3347*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 3348*c0909341SAndroid Build Coastguard 
Worker movzx mxd, myb 3349*c0909341SAndroid Build Coastguard Worker shr myd, 16 3350*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3351*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3352*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3353*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 3354*c0909341SAndroid Build Coastguard Worker sub srcq, stride3q 3355*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 3356*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 3357*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q0000 3358*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q1111 3359*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q2222 3360*c0909341SAndroid Build Coastguard Worker pshufd m15, m0, q3333 3361*c0909341SAndroid Build Coastguard Worker jmp .hv_w8 3362*c0909341SAndroid Build Coastguard Worker.hv_w4: 3363*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 3364*c0909341SAndroid Build Coastguard Worker dec srcq 3365*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 3366*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3367*c0909341SAndroid Build Coastguard Worker shr myd, 16 3368*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3369*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3370*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3371*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 3372*c0909341SAndroid Build Coastguard Worker sub srcq, stride3q 3373*c0909341SAndroid Build Coastguard Worker mova m7, [subpel_h_shuf4] 3374*c0909341SAndroid Build Coastguard Worker pmovzxbd m9, [deint_shuf4] 3375*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pw_8192] 3376*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 3377*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; 
sign-extend 3378*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [pd_32] 3379*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q0000 3380*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q1111 3381*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q2222 3382*c0909341SAndroid Build Coastguard Worker pshufd m15, m0, q3333 3383*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+strideq*0] 3384*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*1] 3385*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+strideq*2] 3386*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+stride3q ] 3387*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3388*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+strideq*0] 3389*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [srcq+strideq*1] 3390*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+strideq*2] 3391*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0xcc ; 0 1 3392*c0909341SAndroid Build Coastguard Worker vpblendd m0, m5, 0xcc ; 2 3 3393*c0909341SAndroid Build Coastguard Worker vpblendd m3, m6, 0xcc ; 4 5 3394*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 ; 00 01 10 11 02 03 12 13 3395*c0909341SAndroid Build Coastguard Worker pshufb m0, m7 ; 20 21 30 31 22 23 32 33 3396*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 ; 40 41 50 51 42 43 52 53 3397*c0909341SAndroid Build Coastguard Worker pshufb m1, m7 ; 60 61 60 61 62 63 62 63 3398*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m8 3399*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m8 3400*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m8 3401*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m8 3402*c0909341SAndroid Build Coastguard Worker phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b 3403*c0909341SAndroid Build Coastguard Worker phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ 
3404*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m10 3405*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m10 3406*c0909341SAndroid Build Coastguard Worker palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b 3407*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m4 ; 01 12 3408*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m4 ; 23 34 3409*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q2121 3410*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m0 ; 45 56 3411*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 3412*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m1, m12 ; a0 b0 3413*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m2, m12 ; c0 d0 3414*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m13 ; a1 b1 3415*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m3, m13 ; c1 d1 3416*c0909341SAndroid Build Coastguard Worker mova m1, m3 3417*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 ; a2 b2 3418*c0909341SAndroid Build Coastguard Worker paddd m5, m2 3419*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+stride3q ] 3420*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3421*c0909341SAndroid Build Coastguard Worker paddd m6, m4 3422*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*0] 3423*c0909341SAndroid Build Coastguard Worker paddd m5, m3 3424*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+strideq*1] 3425*c0909341SAndroid Build Coastguard Worker vpblendd m2, m4, 0xcc 3426*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [srcq+strideq*2] 3427*c0909341SAndroid Build Coastguard Worker vpblendd m3, m4, 0xcc 3428*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 3429*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 3430*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m8 3431*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m8 3432*c0909341SAndroid Build Coastguard Worker phaddw m2, m3 
3433*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m10 3434*c0909341SAndroid Build Coastguard Worker palignr m3, m2, m0, 12 3435*c0909341SAndroid Build Coastguard Worker mova m0, m2 3436*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 ; 67 78 3437*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 ; 89 9a 3438*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2, m14 ; c2 d2 3439*c0909341SAndroid Build Coastguard Worker paddd m6, m11 3440*c0909341SAndroid Build Coastguard Worker paddd m5, m11 3441*c0909341SAndroid Build Coastguard Worker paddd m6, m4 3442*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2, m15 ; a3 b3 3443*c0909341SAndroid Build Coastguard Worker paddd m5, m4 3444*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m3, m15 ; c3 d3 3445*c0909341SAndroid Build Coastguard Worker paddd m6, m4 3446*c0909341SAndroid Build Coastguard Worker psrad m5, 6 3447*c0909341SAndroid Build Coastguard Worker psrad m6, 6 3448*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 3449*c0909341SAndroid Build Coastguard Worker vpermd m5, m9, m5 3450*c0909341SAndroid Build Coastguard Worker mova [tmpq], m5 3451*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3452*c0909341SAndroid Build Coastguard Worker sub hd, 4 3453*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 3454*c0909341SAndroid Build Coastguard Worker RET 3455*c0909341SAndroid Build Coastguard Worker.hv_w8: 3456*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*8-64] 3457*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*4] 3458*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0: 3459*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufA] 3460*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+strideq*0] 3461*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+strideq*2] 3462*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [subpel_h_shufB] 3463*c0909341SAndroid Build Coastguard Worker movu xm5, 
[srcq+strideq*1] 3464*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3465*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [subpel_h_shufC] 3466*c0909341SAndroid Build Coastguard Worker movu xm6, [r5+strideq*0] 3467*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m0, [r5+strideq*1] 3468*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3469*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0xf0 ; 0 3 3470*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [r5+strideq*0], 1 ; 1 4 3471*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [r5+strideq*1], 1 ; 2 5 3472*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3473*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r5+strideq*0], 1 ; 3 6 3474*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 3475*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 3476*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 3477*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 3478*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 3479*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, q3120 3480*c0909341SAndroid Build Coastguard Worker vpermq m5, m5, q3120 3481*c0909341SAndroid Build Coastguard Worker vpermq m6, m6, q3120 3482*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 3483*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 3484*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 3485*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m7 3486*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 3487*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 ; 01 3488*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 ; 34 3489*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5, m6 ; 12 3490*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 ; 45 
3491*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m6, m7 ; 23 3492*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 56 3493*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 3494*c0909341SAndroid Build Coastguard Worker vextracti128 [r7], m0, 1 ; not enough registers 3495*c0909341SAndroid Build Coastguard Worker movu xm0, [r5+strideq*1] 3496*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 3497*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [r5+strideq*0], 1 ; 7 8 3498*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m1, m12 ; a0 3499*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m2, m12 ; b0 3500*c0909341SAndroid Build Coastguard Worker mova m1, m3 3501*c0909341SAndroid Build Coastguard Worker mova m2, m4 3502*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 ; a1 3503*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 ; b1 3504*c0909341SAndroid Build Coastguard Worker paddd m8, m3 3505*c0909341SAndroid Build Coastguard Worker paddd m9, m4 3506*c0909341SAndroid Build Coastguard Worker mova m3, m5 3507*c0909341SAndroid Build Coastguard Worker mova m4, m6 3508*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m14 ; a2 3509*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m14 ; b2 3510*c0909341SAndroid Build Coastguard Worker paddd m8, m5 3511*c0909341SAndroid Build Coastguard Worker paddd m9, m6 3512*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [subpel_h_shufB] 3513*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [subpel_h_shufC] 3514*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [subpel_h_shufA] 3515*c0909341SAndroid Build Coastguard Worker HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 3516*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_8192] 3517*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pd_32] 3518*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [r7] 3519*c0909341SAndroid Build Coastguard Worker pmulhrsw 
m0, m5 3520*c0909341SAndroid Build Coastguard Worker paddd m8, m7 3521*c0909341SAndroid Build Coastguard Worker paddd m9, m7 3522*c0909341SAndroid Build Coastguard Worker vpermq m7, m0, q3120 ; 7 8 3523*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x04 ; 6 7 3524*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m7 ; 67 3525*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 ; 78 3526*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m5, m15 ; a3 3527*c0909341SAndroid Build Coastguard Worker paddd m8, m7 3528*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m6, m15 ; b3 3529*c0909341SAndroid Build Coastguard Worker paddd m7, m9 3530*c0909341SAndroid Build Coastguard Worker psrad m8, 6 3531*c0909341SAndroid Build Coastguard Worker psrad m7, 6 3532*c0909341SAndroid Build Coastguard Worker packssdw m8, m7 3533*c0909341SAndroid Build Coastguard Worker vpermq m7, m8, q3120 3534*c0909341SAndroid Build Coastguard Worker mova [r7+wq*0], xm7 3535*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+wq*2], m7, 1 3536*c0909341SAndroid Build Coastguard Worker lea r7, [r7+wq*4] 3537*c0909341SAndroid Build Coastguard Worker sub hd, 2 3538*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 3539*c0909341SAndroid Build Coastguard Worker add srcq, 8 3540*c0909341SAndroid Build Coastguard Worker add tmpq, 16 3541*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 3542*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 3543*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop0 3544*c0909341SAndroid Build Coastguard Worker RET 3545*c0909341SAndroid Build Coastguard Worker 3546*c0909341SAndroid Build Coastguard Worker%macro movifprep 2 3547*c0909341SAndroid Build Coastguard Worker %if isprep 3548*c0909341SAndroid Build Coastguard Worker mov %1, %2 3549*c0909341SAndroid Build Coastguard Worker %endif 3550*c0909341SAndroid Build Coastguard Worker%endmacro 3551*c0909341SAndroid Build Coastguard Worker 3552*c0909341SAndroid Build 
; REMAP_REG dst, src
; Redefines the register names r<dst>, r<dst>q and r<dst>d as whatever
; r<src>, r<src>q and r<src>d currently expand to. %xdefine expands its body
; at definition time, so the alias captures the present mapping of r<src>.
%macro REMAP_REG 2
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
%endmacro

; Prep variant only: shift the register numbering down by one, i.e. after this
; macro rN names what r(N-1) named before, for N = 14 .. 1. The walk goes from
; 14 downwards so each step reads a mapping that has not been rewritten yet.
; The previous r14 is kept in r14_save for the inverse macro below.
; NOTE(review): presumably this lets prep (one fewer leading argument than put)
; share register numbers with the put code path - confirm against callers.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %xdefine r14_save r14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %endif
%endmacro

; Prep variant only: inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV. Walks
; upwards (r1 <- r2, ..., r13 <- r14), then restores r14 from r14_save and
; drops the temporary define.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  %xdefine r14 r14_save
  %undef r14_save
 %endif
%endmacro

; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
; Emits RET with the default register mapping in effect, then (unless the
; argument is 0) re-applies the shifted mapping for the code that follows.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

; MC_8TAP_SCALED_H dst, tmp0 .. tmp6   (arguments are ymm register numbers)
; Gathers eight 8-byte groups per row, at srcq + {r4, r6, r7, r9, r10, r11,
; r13, rX}, for two consecutive rows, multiplies them by the byte
; coefficients in m15 / m10, sums horizontally and scales by m12 (pmulhrsw).
;   out:      m<dst> holds the combined result for both rows
;   clobbers: m<tmp0> .. m<tmp6>
;   side effect: srcq is advanced by 2*ssq
%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    ; first row: qwords {r4, r7, r10, r13} -> m%1, {r6, r9, r11, rX} -> m%2
    movq               xm%1, [srcq+ r4]
    movq               xm%2, [srcq+ r6]
    movhps             xm%1, [srcq+ r7]
    movhps             xm%2, [srcq+ r9]
    vinserti128         m%1, [srcq+r10], 1
    vinserti128         m%2, [srcq+r11], 1
    vpbroadcastq        m%5, [srcq+r13]
    vpbroadcastq        m%6, [srcq+ rX]
    add                srcq, ssq
    ; second row, same layout -> m%3 / m%4
    movq               xm%3, [srcq+ r4]
    movq               xm%4, [srcq+ r6]
    movhps             xm%3, [srcq+ r7]
    movhps             xm%4, [srcq+ r9]
    vinserti128         m%3, [srcq+r10], 1
    vinserti128         m%4, [srcq+r11], 1
    vpbroadcastq        m%7, [srcq+r13]
    vpbroadcastq        m%8, [srcq+ rX]
    add                srcq, ssq
    ; 0xc0 selects the top qword, completing each vector with the
    ; broadcast r13 / rX group
    vpblendd            m%1, m%5, 0xc0
    vpblendd            m%2, m%6, 0xc0
    vpblendd            m%3, m%7, 0xc0
    vpblendd            m%4, m%8, 0xc0
    pmaddubsw           m%1, m15
    pmaddubsw           m%2, m10
    pmaddubsw           m%3, m15
    pmaddubsw           m%4, m10
    phaddw              m%1, m%2
    phaddw              m%3, m%4
    phaddw              m%1, m%3
    pmulhrsw            m%1, m12
%endmacro

; MC_8TAP_SCALED put|prep
; Opening lines of the scaled 8-tap function template; the body continues
; past this point. The argument selects the variant and with it:
;   isprep      0 for put, 1 for prep
;   base_reg    r12 for put, r11 for prep (holds the function's own address)
;   rndshift    10 for put, 6 for prep
;   tmp_stridem prep only, stack slot at [rsp+120]
%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isprep 0
cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
 %xdefine base_reg r12
 %define rndshift 10
%else
 %assign isprep 1
cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
 %define tmp_stridem qword [rsp+120]
 %xdefine base_reg r11
 %define rndshift 6
%endif
    lea            base_reg, [%1_8tap_scaled_8bpc_avx2]
; constants are addressed relative to the function start held in base_reg
%define base base_reg-%1_8tap_scaled_8bpc_avx2
    tzcnt                wd, wm      ; wd = number of trailing zero bits of w
    vpbroadcastd         m8, dxm
%if isprep && UNIX64
Build Coastguard Worker movd xm14, mxd 3643*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, xm14 3644*c0909341SAndroid Build Coastguard Worker mov r5d, t0d 3645*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 5, 7 3646*c0909341SAndroid Build Coastguard Worker%else 3647*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, mxm 3648*c0909341SAndroid Build Coastguard Worker%endif 3649*c0909341SAndroid Build Coastguard Worker mov dyd, dym 3650*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 3651*c0909341SAndroid Build Coastguard Worker %if WIN64 3652*c0909341SAndroid Build Coastguard Worker mov r8d, hm 3653*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 3654*c0909341SAndroid Build Coastguard Worker %define hm r5m 3655*c0909341SAndroid Build Coastguard Worker %define dxm r8m 3656*c0909341SAndroid Build Coastguard Worker %else 3657*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 3658*c0909341SAndroid Build Coastguard Worker %define hm r6m 3659*c0909341SAndroid Build Coastguard Worker %endif 3660*c0909341SAndroid Build Coastguard Worker %define dsm [rsp+112] 3661*c0909341SAndroid Build Coastguard Worker %define rX r1 3662*c0909341SAndroid Build Coastguard Worker %define rXd r1d 3663*c0909341SAndroid Build Coastguard Worker%else ; prep 3664*c0909341SAndroid Build Coastguard Worker %if WIN64 3665*c0909341SAndroid Build Coastguard Worker mov r7d, hm 3666*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 3667*c0909341SAndroid Build Coastguard Worker %define hm r4m 3668*c0909341SAndroid Build Coastguard Worker %define dxm r7m 3669*c0909341SAndroid Build Coastguard Worker %else 3670*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 3671*c0909341SAndroid Build Coastguard Worker %define hm [rsp+112] 3672*c0909341SAndroid Build Coastguard Worker %endif 3673*c0909341SAndroid 
Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 3674*c0909341SAndroid Build Coastguard Worker %define rX r14 3675*c0909341SAndroid Build Coastguard Worker %define rXd r14d 3676*c0909341SAndroid Build Coastguard Worker%endif 3677*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pd_0x3ff] 3678*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+pw_8192] 3679*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 3680*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+pd_512] 3681*c0909341SAndroid Build Coastguard Worker%else 3682*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+pd_32] 3683*c0909341SAndroid Build Coastguard Worker%endif 3684*c0909341SAndroid Build Coastguard Worker pxor m9, m9 3685*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 3686*c0909341SAndroid Build Coastguard Worker movzx r7d, t1b 3687*c0909341SAndroid Build Coastguard Worker shr t1d, 16 3688*c0909341SAndroid Build Coastguard Worker cmp hd, 6 3689*c0909341SAndroid Build Coastguard Worker cmovs t1d, r7d 3690*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 3691*c0909341SAndroid Build Coastguard Worker cmp dyd, 1024 3692*c0909341SAndroid Build Coastguard Worker je .dy1 3693*c0909341SAndroid Build Coastguard Worker cmp dyd, 2048 3694*c0909341SAndroid Build Coastguard Worker je .dy2 3695*c0909341SAndroid Build Coastguard Worker movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] 3696*c0909341SAndroid Build Coastguard Worker add wq, base_reg 3697*c0909341SAndroid Build Coastguard Worker jmp wq 3698*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 3699*c0909341SAndroid Build Coastguard Worker.w2: 3700*c0909341SAndroid Build Coastguard Worker mov myd, mym 3701*c0909341SAndroid Build Coastguard Worker movzx t0d, t0b 3702*c0909341SAndroid Build Coastguard Worker dec srcq 3703*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 3704*c0909341SAndroid Build Coastguard Worker punpckldq m8, m9, m8 
3705*c0909341SAndroid Build Coastguard Worker paddd m14, m8 ; mx+dx*[0,1] 3706*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pd_0x4000] 3707*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, xm15 3708*c0909341SAndroid Build Coastguard Worker pand m8, m14, m10 3709*c0909341SAndroid Build Coastguard Worker psrld m8, 6 3710*c0909341SAndroid Build Coastguard Worker paddd xm15, xm8 3711*c0909341SAndroid Build Coastguard Worker movd r4d, xm15 3712*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 1 3713*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [base+bdct_lb_dw] 3714*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [base+subpel_s_shuf2] 3715*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [base+subpel_filters+r4*8+2] 3716*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+subpel_filters+r6*8+2] 3717*c0909341SAndroid Build Coastguard Worker pcmpeqd m8, m9 3718*c0909341SAndroid Build Coastguard Worker psrld m14, 10 3719*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*0] 3720*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*2] 3721*c0909341SAndroid Build Coastguard Worker movhps xm0, [srcq+ssq*1] 3722*c0909341SAndroid Build Coastguard Worker movhps xm1, [srcq+ss3q ] 3723*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3724*c0909341SAndroid Build Coastguard Worker pshufb m14, m5 3725*c0909341SAndroid Build Coastguard Worker paddb m14, m6 3726*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [srcq+ssq*0], 1 3727*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+ssq*2], 1 3728*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*1] 3729*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ss3q ] 3730*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3731*c0909341SAndroid Build Coastguard Worker vpblendd m15, m7, 0xaa 3732*c0909341SAndroid Build Coastguard Worker vpblendd m0, m2, 0xc0 ; 0 
1 4 5 3733*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0xc0 ; 2 3 6 7 3734*c0909341SAndroid Build Coastguard Worker pblendvb m15, m11, m8 3735*c0909341SAndroid Build Coastguard Worker pshufb m0, m14 3736*c0909341SAndroid Build Coastguard Worker pshufb m1, m14 3737*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m15 3738*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m15 3739*c0909341SAndroid Build Coastguard Worker phaddw m0, m1 3740*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 3741*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 ; 4 5 6 7 3742*c0909341SAndroid Build Coastguard Worker palignr xm2, xm1, xm0, 4 ; 1 2 3 4 3743*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm0, xm2 ; 01 12 3744*c0909341SAndroid Build Coastguard Worker punpckhwd xm0, xm2 ; 23 34 3745*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm1, q0321 ; 5 6 7 _ 3746*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm1, xm4 ; 45 56 3747*c0909341SAndroid Build Coastguard Worker punpckhwd xm4, xm1, xm4 ; 67 __ 3748*c0909341SAndroid Build Coastguard Worker.w2_loop: 3749*c0909341SAndroid Build Coastguard Worker and myd, 0x3ff 3750*c0909341SAndroid Build Coastguard Worker mov r6d, 64 << 24 3751*c0909341SAndroid Build Coastguard Worker mov r4d, myd 3752*c0909341SAndroid Build Coastguard Worker shr r4d, 6 3753*c0909341SAndroid Build Coastguard Worker lea r4d, [t1+r4] 3754*c0909341SAndroid Build Coastguard Worker cmovnz r6q, [base+subpel_filters+r4*8] 3755*c0909341SAndroid Build Coastguard Worker movq xm11, r6q 3756*c0909341SAndroid Build Coastguard Worker pmovsxbw xm11, xm11 3757*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm11, q0000 3758*c0909341SAndroid Build Coastguard Worker pshufd xm9, xm11, q1111 3759*c0909341SAndroid Build Coastguard Worker pshufd xm10, xm11, q2222 3760*c0909341SAndroid Build Coastguard Worker pshufd xm11, xm11, q3333 3761*c0909341SAndroid Build Coastguard Worker pmaddwd 
xm5, xm3, xm8 3762*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm0, xm9 3763*c0909341SAndroid Build Coastguard Worker pmaddwd xm7, xm2, xm10 3764*c0909341SAndroid Build Coastguard Worker pmaddwd xm8, xm4, xm11 3765*c0909341SAndroid Build Coastguard Worker paddd xm5, xm6 3766*c0909341SAndroid Build Coastguard Worker paddd xm7, xm8 3767*c0909341SAndroid Build Coastguard Worker paddd xm5, xm13 3768*c0909341SAndroid Build Coastguard Worker paddd xm5, xm7 3769*c0909341SAndroid Build Coastguard Worker psrad xm5, 10 3770*c0909341SAndroid Build Coastguard Worker packssdw xm5, xm5 3771*c0909341SAndroid Build Coastguard Worker packuswb xm5, xm5 3772*c0909341SAndroid Build Coastguard Worker pextrw [dstq], xm5, 0 3773*c0909341SAndroid Build Coastguard Worker add dstq, dsq 3774*c0909341SAndroid Build Coastguard Worker dec hd 3775*c0909341SAndroid Build Coastguard Worker jz .ret 3776*c0909341SAndroid Build Coastguard Worker add myd, dyd 3777*c0909341SAndroid Build Coastguard Worker test myd, ~0x3ff 3778*c0909341SAndroid Build Coastguard Worker jz .w2_loop 3779*c0909341SAndroid Build Coastguard Worker movq xm5, [srcq] 3780*c0909341SAndroid Build Coastguard Worker test myd, 0x400 3781*c0909341SAndroid Build Coastguard Worker jz .w2_skip_line 3782*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3783*c0909341SAndroid Build Coastguard Worker shufps xm3, xm0, q1032 ; 01 12 3784*c0909341SAndroid Build Coastguard Worker shufps xm0, xm2, q1032 ; 23 34 3785*c0909341SAndroid Build Coastguard Worker shufps xm2, xm4, q1032 ; 45 56 3786*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm14 3787*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm15 3788*c0909341SAndroid Build Coastguard Worker phaddw xm5, xm5 3789*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm12 3790*c0909341SAndroid Build Coastguard Worker palignr xm1, xm5, xm1, 12 3791*c0909341SAndroid Build Coastguard Worker punpcklqdq xm1, xm1 ; 6 7 6 7 3792*c0909341SAndroid Build Coastguard Worker 
punpcklwd xm4, xm1, xm5 ; 67 __ 3793*c0909341SAndroid Build Coastguard Worker jmp .w2_loop 3794*c0909341SAndroid Build Coastguard Worker.w2_skip_line: 3795*c0909341SAndroid Build Coastguard Worker movhps xm5, [srcq+ssq*1] 3796*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3797*c0909341SAndroid Build Coastguard Worker mova xm3, xm0 ; 01 12 3798*c0909341SAndroid Build Coastguard Worker mova xm0, xm2 ; 23 34 3799*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm14 3800*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm15 3801*c0909341SAndroid Build Coastguard Worker phaddw xm5, xm5 3802*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm12 ; 6 7 6 7 3803*c0909341SAndroid Build Coastguard Worker palignr xm1, xm5, xm1, 8 ; 4 5 6 7 3804*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm1, q0321 ; 5 6 7 _ 3805*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm1, xm5 ; 45 56 3806*c0909341SAndroid Build Coastguard Worker punpckhwd xm4, xm1, xm5 ; 67 __ 3807*c0909341SAndroid Build Coastguard Worker jmp .w2_loop 3808*c0909341SAndroid Build Coastguard Worker%endif 3809*c0909341SAndroid Build Coastguard Worker.w4: 3810*c0909341SAndroid Build Coastguard Worker mov myd, mym 3811*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [base+rescale_mul] 3812*c0909341SAndroid Build Coastguard Worker movzx t0d, t0b 3813*c0909341SAndroid Build Coastguard Worker dec srcq 3814*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 3815*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m7 3816*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pd_0x4000] 3817*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, xm15 3818*c0909341SAndroid Build Coastguard Worker paddd m14, m8 ; mx+dx*[0-3] 3819*c0909341SAndroid Build Coastguard Worker pand m0, m14, m10 3820*c0909341SAndroid Build Coastguard Worker psrld m0, 6 3821*c0909341SAndroid Build Coastguard Worker paddd xm15, xm0 3822*c0909341SAndroid Build Coastguard 
Worker movd r4d, xm15 3823*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 1 3824*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm15, 2 3825*c0909341SAndroid Build Coastguard Worker pextrd r13d, xm15, 3 3826*c0909341SAndroid Build Coastguard Worker movd xm15, [base+subpel_filters+r4*8+2] 3827*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [base+bdct_lb_dw] 3828*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [base+subpel_s_shuf2] 3829*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r6*8+2], 1 3830*c0909341SAndroid Build Coastguard Worker pcmpeqd m0, m9 3831*c0909341SAndroid Build Coastguard Worker psrld m14, 10 3832*c0909341SAndroid Build Coastguard Worker movu xm7, [srcq+ssq*0] 3833*c0909341SAndroid Build Coastguard Worker movu xm9, [srcq+ssq*1] 3834*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r11*8+2], 2 3835*c0909341SAndroid Build Coastguard Worker movu xm8, [srcq+ssq*2] 3836*c0909341SAndroid Build Coastguard Worker movu xm10, [srcq+ss3q ] 3837*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r13*8+2], 3 3838*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3839*c0909341SAndroid Build Coastguard Worker pshufb m14, m5 3840*c0909341SAndroid Build Coastguard Worker paddb m14, m6 3841*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [srcq+ssq*0], 1 3842*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [srcq+ssq*1], 1 3843*c0909341SAndroid Build Coastguard Worker vinserti128 m15, xm15, 1 3844*c0909341SAndroid Build Coastguard Worker vinserti128 m8, [srcq+ssq*2], 1 3845*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [srcq+ss3q ], 1 3846*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3847*c0909341SAndroid Build Coastguard Worker pblendvb m15, m11, m0 3848*c0909341SAndroid Build Coastguard Worker pshufb m7, m14 3849*c0909341SAndroid Build Coastguard Worker pshufb m9, m14 
3850*c0909341SAndroid Build Coastguard Worker pshufb m8, m14 3851*c0909341SAndroid Build Coastguard Worker pshufb m10, m14 3852*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, m15 3853*c0909341SAndroid Build Coastguard Worker pmaddubsw m9, m15 3854*c0909341SAndroid Build Coastguard Worker pmaddubsw m8, m15 3855*c0909341SAndroid Build Coastguard Worker pmaddubsw m10, m15 3856*c0909341SAndroid Build Coastguard Worker phaddw m7, m9 3857*c0909341SAndroid Build Coastguard Worker phaddw m8, m10 3858*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m12 ; 0 1 4 5 3859*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m12 ; 2 3 6 7 3860*c0909341SAndroid Build Coastguard Worker vextracti128 xm9, m7, 1 ; 4 5 3861*c0909341SAndroid Build Coastguard Worker vextracti128 xm3, m8, 1 ; 6 7 3862*c0909341SAndroid Build Coastguard Worker shufps xm4, xm7, xm8, q1032 ; 1 2 3863*c0909341SAndroid Build Coastguard Worker shufps xm5, xm8, xm9, q1032 ; 3 4 3864*c0909341SAndroid Build Coastguard Worker shufps xm6, xm9, xm3, q1032 ; 5 6 3865*c0909341SAndroid Build Coastguard Worker psrldq xm11, xm3, 8 ; 7 _ 3866*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm7, xm4 ; 01 3867*c0909341SAndroid Build Coastguard Worker punpckhwd xm7, xm4 ; 12 3868*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm8, xm5 ; 23 3869*c0909341SAndroid Build Coastguard Worker punpckhwd xm8, xm5 ; 34 3870*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm9, xm6 ; 45 3871*c0909341SAndroid Build Coastguard Worker punpckhwd xm9, xm6 ; 56 3872*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm11 ; 67 3873*c0909341SAndroid Build Coastguard Worker mova [rsp+0x00], xm7 3874*c0909341SAndroid Build Coastguard Worker mova [rsp+0x10], xm8 3875*c0909341SAndroid Build Coastguard Worker mova [rsp+0x20], xm9 3876*c0909341SAndroid Build Coastguard Worker.w4_loop: 3877*c0909341SAndroid Build Coastguard Worker and myd, 0x3ff 3878*c0909341SAndroid Build Coastguard Worker mov r6d, 64 << 24 
3879*c0909341SAndroid Build Coastguard Worker mov r4d, myd 3880*c0909341SAndroid Build Coastguard Worker shr r4d, 6 3881*c0909341SAndroid Build Coastguard Worker lea r4d, [t1+r4] 3882*c0909341SAndroid Build Coastguard Worker cmovnz r6q, [base+subpel_filters+r4*8] 3883*c0909341SAndroid Build Coastguard Worker movq xm10, r6q 3884*c0909341SAndroid Build Coastguard Worker pmovsxbw xm10, xm10 3885*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm10, q0000 3886*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm10, q1111 3887*c0909341SAndroid Build Coastguard Worker pshufd xm9, xm10, q2222 3888*c0909341SAndroid Build Coastguard Worker pshufd xm10, xm10, q3333 3889*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm0, xm7 3890*c0909341SAndroid Build Coastguard Worker pmaddwd xm5, xm1, xm8 3891*c0909341SAndroid Build Coastguard Worker pmaddwd xm6, xm2, xm9 3892*c0909341SAndroid Build Coastguard Worker pmaddwd xm7, xm3, xm10 3893*c0909341SAndroid Build Coastguard Worker paddd xm4, xm5 3894*c0909341SAndroid Build Coastguard Worker paddd xm6, xm7 3895*c0909341SAndroid Build Coastguard Worker paddd xm4, xm13 3896*c0909341SAndroid Build Coastguard Worker paddd xm4, xm6 3897*c0909341SAndroid Build Coastguard Worker psrad xm4, rndshift 3898*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm4 3899*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 3900*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm4 3901*c0909341SAndroid Build Coastguard Worker movd [dstq], xm4 3902*c0909341SAndroid Build Coastguard Worker add dstq, dsq 3903*c0909341SAndroid Build Coastguard Worker%else 3904*c0909341SAndroid Build Coastguard Worker movq [tmpq], xm4 3905*c0909341SAndroid Build Coastguard Worker add tmpq, 8 3906*c0909341SAndroid Build Coastguard Worker%endif 3907*c0909341SAndroid Build Coastguard Worker dec hd 3908*c0909341SAndroid Build Coastguard Worker jz .ret 3909*c0909341SAndroid Build Coastguard Worker add myd, dyd 3910*c0909341SAndroid Build Coastguard 
Worker test myd, ~0x3ff 3911*c0909341SAndroid Build Coastguard Worker jz .w4_loop 3912*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq] 3913*c0909341SAndroid Build Coastguard Worker test myd, 0x400 3914*c0909341SAndroid Build Coastguard Worker jz .w4_skip_line 3915*c0909341SAndroid Build Coastguard Worker mova xm0, [rsp+0x00] 3916*c0909341SAndroid Build Coastguard Worker mova [rsp+0x00], xm1 3917*c0909341SAndroid Build Coastguard Worker mova xm1, [rsp+0x10] 3918*c0909341SAndroid Build Coastguard Worker mova [rsp+0x10], xm2 3919*c0909341SAndroid Build Coastguard Worker mova xm2, [rsp+0x20] 3920*c0909341SAndroid Build Coastguard Worker mova [rsp+0x20], xm3 3921*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm14 3922*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm15 3923*c0909341SAndroid Build Coastguard Worker phaddw xm4, xm4 3924*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm12 3925*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm11, xm4 3926*c0909341SAndroid Build Coastguard Worker mova xm11, xm4 3927*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3928*c0909341SAndroid Build Coastguard Worker jmp .w4_loop 3929*c0909341SAndroid Build Coastguard Worker.w4_skip_line: 3930*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*1] 3931*c0909341SAndroid Build Coastguard Worker movu m6, [rsp+0x10] 3932*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm14 3933*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm14 3934*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm15 3935*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm15 3936*c0909341SAndroid Build Coastguard Worker movu [rsp+0x00], m6 3937*c0909341SAndroid Build Coastguard Worker phaddw xm4, xm5 3938*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm12 3939*c0909341SAndroid Build Coastguard Worker punpcklwd xm9, xm11, xm4 3940*c0909341SAndroid Build Coastguard Worker mova [rsp+0x20], xm9 3941*c0909341SAndroid Build 
Coastguard Worker psrldq xm11, xm4, 8 3942*c0909341SAndroid Build Coastguard Worker mova xm0, xm1 3943*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 3944*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 3945*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm4, xm11 3946*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3947*c0909341SAndroid Build Coastguard Worker jmp .w4_loop 3948*c0909341SAndroid Build Coastguard Worker.w8: 3949*c0909341SAndroid Build Coastguard Worker mov dword [rsp+48], 1 3950*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 16 3951*c0909341SAndroid Build Coastguard Worker jmp .w_start 3952*c0909341SAndroid Build Coastguard Worker.w16: 3953*c0909341SAndroid Build Coastguard Worker mov dword [rsp+48], 2 3954*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 32 3955*c0909341SAndroid Build Coastguard Worker jmp .w_start 3956*c0909341SAndroid Build Coastguard Worker.w32: 3957*c0909341SAndroid Build Coastguard Worker mov dword [rsp+48], 4 3958*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 64 3959*c0909341SAndroid Build Coastguard Worker jmp .w_start 3960*c0909341SAndroid Build Coastguard Worker.w64: 3961*c0909341SAndroid Build Coastguard Worker mov dword [rsp+48], 8 3962*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 128 3963*c0909341SAndroid Build Coastguard Worker jmp .w_start 3964*c0909341SAndroid Build Coastguard Worker.w128: 3965*c0909341SAndroid Build Coastguard Worker mov dword [rsp+48], 16 3966*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 256 3967*c0909341SAndroid Build Coastguard Worker.w_start: 3968*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 3969*c0909341SAndroid Build Coastguard Worker movifnidn dsm, dsq 3970*c0909341SAndroid Build Coastguard Worker%endif 3971*c0909341SAndroid Build Coastguard Worker shr t0d, 16 3972*c0909341SAndroid Build Coastguard Worker sub srcq, 3 3973*c0909341SAndroid Build Coastguard 
Worker pmaddwd m8, [base+rescale_mul] 3974*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 3975*c0909341SAndroid Build Coastguard Worker mov [rsp+72], t0d 3976*c0909341SAndroid Build Coastguard Worker mov [rsp+56], srcq 3977*c0909341SAndroid Build Coastguard Worker mov [rsp+64], r0q ; dstq / tmpq 3978*c0909341SAndroid Build Coastguard Worker%if UNIX64 3979*c0909341SAndroid Build Coastguard Worker mov hm, hd 3980*c0909341SAndroid Build Coastguard Worker%endif 3981*c0909341SAndroid Build Coastguard Worker shl dword dxm, 3 ; dx*8 3982*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, xm15 3983*c0909341SAndroid Build Coastguard Worker paddd m14, m8 ; mx+dx*[0-7] 3984*c0909341SAndroid Build Coastguard Worker jmp .hloop 3985*c0909341SAndroid Build Coastguard Worker.hloop_prep: 3986*c0909341SAndroid Build Coastguard Worker dec dword [rsp+48] 3987*c0909341SAndroid Build Coastguard Worker jz .ret 3988*c0909341SAndroid Build Coastguard Worker add qword [rsp+64], 8*(isprep+1) 3989*c0909341SAndroid Build Coastguard Worker mov hd, hm 3990*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, dxm 3991*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pd_0x3ff] 3992*c0909341SAndroid Build Coastguard Worker paddd m14, m8, [rsp+16] 3993*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [rsp+72] 3994*c0909341SAndroid Build Coastguard Worker pxor m9, m9 3995*c0909341SAndroid Build Coastguard Worker mov srcq, [rsp+56] 3996*c0909341SAndroid Build Coastguard Worker mov r0q, [rsp+64] ; dstq / tmpq 3997*c0909341SAndroid Build Coastguard Worker.hloop: 3998*c0909341SAndroid Build Coastguard Worker vpbroadcastq m11, [base+pq_0x40000000] 3999*c0909341SAndroid Build Coastguard Worker pand m6, m14, m10 4000*c0909341SAndroid Build Coastguard Worker psrld m6, 6 4001*c0909341SAndroid Build Coastguard Worker paddd m15, m6 4002*c0909341SAndroid Build Coastguard Worker pcmpeqd m6, m9 4003*c0909341SAndroid Build Coastguard Worker vextracti128 xm7, m15, 1 
; ---------------------------------------------------------------------------
; 8-tap scaled MC (AVX2), interior of the %1 = put / prep function body.
;
; Code paths in this span:
;   * tail of the generic-dy per-strip setup, then .vloop / .skip_line
;   * .dy1 dispatch and its width-specific paths: every output row consumes
;     exactly one new source row and the vertical filter is computed once
;   * .dy2 dispatch, .dy2_w2 and the first instructions of .dy2_w4
;
; Register roles that can be read off the code below (all set up outside
; this span unless noted):
;   m12      multiplier for the pmulhrsw that follows each horizontal filter
;   m13      bias added to the vertical sum before `psrad ..., rndshift`
;   m14      per-column x positions; bits 6-9 select the subpel filter,
;            bits 10+ are the integer source offset (see the pand/psrld uses)
;   t0, t1   added to the horizontal / vertical filter index before the
;            subpel_filters lookup
;            NOTE(review): presumably the per-direction filter-set base
;            indices; confirm where they are loaded.
; ---------------------------------------------------------------------------

; --- generic dy: per-strip horizontal setup (continues from above this span)
; The dwords of xm15 / xm7 are used as subpel_filters indices for the eight
; columns. NOTE(review): xm7 is presumably the upper half of m15, as in
; .dy1_hloop further down; the instruction producing it is above this span.
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu           [rsp+16], m14
    ; m15 collects the 8-byte filters of columns 0,1,4,5 and m10 those of
    ; columns 2,3,6,7. vinserti128 loads 16 bytes, so the top qword is
    ; replaced by the vpblendd 0xc0 with the broadcast in m9 / m8 below.
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    mova              [rsp], xm14 ; columns 0-3 offsets, reloaded in .vloop
    ; r4..rX now become the per-column byte offsets added to srcq.
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    ; m5 / m6: m6's per-column dword mask widened to one qword per column,
    ; matching the column order of m15 / m10; masked columns take m11.
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    mov                 myd, mym
    mov                 dyd, dym
    pshufb               m0, m14 ; 01a 01b
    pshufb               m1, m14 ; 23a 23b
    pshufb               m2, m14 ; 45a 45b
    pshufb               m3, m14 ; 67a 67b
    vbroadcasti128      m14, [base+wswap]
; --- generic dy: one output row per iteration; m0-m3 hold the row pairs
; 01 / 23 / 45 / 67 of horizontally filtered source rows.
.vloop:
    and                 myd, 0x3ff ; keep only the 10-bit fraction
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    ; ZF still comes from the shr (mov/lea do not write flags): a zero
    ; filter index keeps 64 << 24, i.e. a lone tap of 64 at byte 3.
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq               xm11, r6q
    punpcklqdq         xm11, xm11
    pmovsxbw            m11, xm11
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pshufd               m8, m11, q2222
    pshufd              m11, m11, q3333
    pmaddwd              m6, m2, m8
    pmaddwd              m7, m3, m11
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .hloop_prep
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .vloop                      ; nothing above the fraction: same rows
    test                myd, 0x400 ; consumed by the jz .skip_line below;
                                   ; the movs in between leave flags alone
    mov            [rsp+52], myd
    ; Reload the column 0-3 offsets saved by `mova [rsp], xm14`; r4/r6 were
    ; used as scratch at the top of .vloop.
    ; NOTE(review): r7/r9 are reloaded and myd/dyd restored afterwards, so
    ; they presumably alias myd/dyd in this register map; confirm against
    ; the argument definitions.
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    jz .skip_line
    ; bit 10 set: fetch and filter one new source row
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    add                srcq, ssq
    mov                 myd, [rsp+52]
    mov                 dyd, dym
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, m10
    phaddw               m4, m5
    pslld                m5, m4, 16
    paddw                m4, m5 ; odd words now hold the full 8-tap sum
    pmulhrsw             m4, m12
    ; odd words of each register come from the next one: the row window
    ; moves down by one row
    pblendw              m0, m1, 0xaa
    pblendw              m1, m2, 0xaa
    pblendw              m2, m3, 0xaa
    pblendw              m3, m4, 0xaa
    jmp .vloop
; bit 10 clear: move the row-pair window down by a whole pair and fetch two
; new source rows (srcq advances by ssq twice).
.skip_line:
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    vpbroadcastq         m7, [srcq+r13]
    vpbroadcastq         m8, [srcq+ rX]
    movq                xm3, [srcq+ r4]
    movq                xm4, [srcq+ r6]
    movhps              xm3, [srcq+ r7]
    movhps              xm4, [srcq+ r9]
    vinserti128          m3, [srcq+r10], 1
    vinserti128          m4, [srcq+r11], 1
    add                srcq, ssq
    movq                xm5, [srcq+ r4]
    movq                xm6, [srcq+ r6]
    movhps              xm5, [srcq+ r7]
    movhps              xm6, [srcq+ r9]
    vinserti128          m5, [srcq+r10], 1
    vinserti128          m6, [srcq+r11], 1
    vpbroadcastq         m9, [srcq+r13]
    vpbroadcastq        m11, [srcq+ rX]
    add                srcq, ssq
    mov                 myd, [rsp+52]
    mov                 dyd, dym
    vpblendd             m3, m7, 0xc0
    vpblendd             m4, m8, 0xc0
    vpblendd             m5, m9, 0xc0
    vpblendd             m6, m11, 0xc0
    pmaddubsw            m3, m15
    pmaddubsw            m4, m10
    pmaddubsw            m5, m15
    pmaddubsw            m6, m10
    phaddw               m3, m4
    phaddw               m5, m6
    psrld                m4, m3, 16
    pslld                m6, m5, 16
    paddw                m3, m4 ; first new row: sums in the even words
    paddw                m5, m6 ; second new row: sums in the odd words
    pblendw              m3, m5, 0xaa
    pmulhrsw             m3, m12
    jmp .vloop
; --- dy1: jump through the 16-bit offset table indexed by wq.
; NOTE(review): wq is presumably a width-class index computed above this
; span; confirm.
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
; --- dy1, w == 2 (put only): 4-tap horizontal filters (the 4 bytes at +2 of
; each subpel_filters entry), two output rows per loop iteration.
.dy1_w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    movq                xm1, [srcq+ssq*2]
    movhps              xm0, [srcq+ssq*1]
    movhps              xm1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    ; ZF from `shr myd, 6`: a zero index keeps the lone-64 tap in r4
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m0, [srcq+ssq*0], 1
    vinserti128          m1, [srcq+ssq*2], 1
    vpbroadcastq         m2, [srcq+ssq*1]
    add                srcq, ss3q
    movq               xm10, r4q
    pmovsxbw           xm10, xm10
    vpblendd            m15, m7, 0xaa
    pblendvb            m15, m11, m8
    pshufd              xm8, xm10, q0000
    pshufd              xm9, xm10, q1111
    pshufd             xm11, xm10, q3333
    pshufd             xm10, xm10, q2222
    vpblendd             m0, m2, 0xc0
    pshufb               m1, m14
    pshufb               m0, m14
    pmaddubsw            m1, m15
    pmaddubsw            m0, m15
    phaddw               m0, m1
    pmulhrsw             m0, m12
    vextracti128        xm1, m0, 1
    palignr             xm2, xm1, xm0, 4
    pshufd              xm4, xm1, q2121
    punpcklwd           xm3, xm0, xm2     ; 01 12
    punpckhwd           xm0, xm2          ; 23 34
    punpcklwd           xm2, xm1, xm4     ; 45 56
.dy1_w2_loop:
    movq                xm1, [srcq+ssq*0]
    movhps              xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd             xm5, xm3, xm8
    pmaddwd             xm6, xm0, xm9
    pmaddwd             xm7, xm2, xm10
    mova                xm3, xm0
    mova                xm0, xm2
    paddd               xm5, xm13
    paddd               xm6, xm7
    pshufb              xm1, xm14
    pmaddubsw           xm1, xm15
    phaddw              xm1, xm1
    pmulhrsw            xm1, xm12
    palignr             xm7, xm1, xm4, 12
    punpcklwd           xm2, xm7, xm1     ; 67 78
    pmaddwd             xm7, xm2, xm11
    mova                xm4, xm1
    paddd               xm5, xm6
    paddd               xm5, xm7
    psrad               xm5, rndshift
    packssdw            xm5, xm5
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
; --- dy1, w == 4: 4-tap horizontal filters, two output rows per iteration.
.dy1_w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    pmaddwd              m8, m7
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    vpermq               m8, m8, q3120
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd             r11d, xm15, 1
    pextrd             r13d, xm15, 3
    movd               xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    movu                xm2, [srcq+ssq*0]
    movu                xm3, [srcq+ssq*2]
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vpbroadcastq         m6, [base+subpel_s_shuf2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
    ; the -20 displacement puts the wanted 4 bytes at dword 5 of the
    ; 32-byte memory operand, the only dword selected by mask 0x20
    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20
    vinserti128          m2, [srcq+ssq*1], 1
    vinserti128          m3, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    ; ZF from `shr myd, 6`: a zero index keeps the lone-64 tap in r4
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movu                xm4, [srcq+ssq*0]
    movu                xm5, [srcq+ssq*2]
    vinserti128          m4, [srcq+ssq*1], 1
    add                srcq, ss3q
    vpblendd            m15, m7, 0x30
    punpcklqdq          m15, m15
    pblendvb            m15, m11, m8
    movq               xm10, r4q
    punpcklqdq         xm10, xm10
    pmovsxbw            m10, xm10
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb              xm5, xm14
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q3120
    vpermq               m4, m4, q3120
    vpermq               m5, m5, q3120
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    phaddw               m2, m3
    phaddw               m4, m5
    pmulhrsw             m2, m12
    pmulhrsw             m4, m12
    palignr              m5, m4, m2, 4
    pshufd               m3, m4, q2121
    punpcklwd            m0, m2, m5 ; 01 12
    punpckhwd            m1, m2, m5 ; 23 34
    punpcklwd            m2, m4, m3 ; 45 56
.dy1_w4_loop:
    movu               xm11, [srcq+ssq*0]
    vinserti128         m11, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m4, m0, m7
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    mova                 m0, m1
    mova                 m1, m2
    paddd                m4, m13
    paddd                m5, m6
    pshufb              m11, m14
    vpermq              m11, m11, q3120
    pmaddubsw           m11, m15
    phaddw              m11, m11
    pmulhrsw            m11, m12
    palignr              m6, m11, m3, 12
    punpcklwd            m2, m6, m11 ; 67 78
    mova                 m3, m11
    pmaddwd              m6, m2, m10
    paddd                m4, m5
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    pshuflw             xm4, xm4, q3120
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
%else
    pshufd              xm4, xm4, q3120
    mova             [tmpq], xm4
    add                tmpq, 16
%endif
    sub                  hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET
; --- dy1, w >= 8: processed as 8-pixel-wide column strips. [rsp+72] is the
; strip counter (w/8), decremented in .dy1_hloop_prep.
; NOTE(review): movifprep is defined elsewhere; presumably it sets
; tmp_stridem only when assembling the prep variant.
.dy1_w8:
    mov      dword [rsp+72], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov      dword [rsp+72], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov      dword [rsp+72], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov      dword [rsp+72], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov      dword [rsp+72], 16
    movifprep   tmp_stridem, 256
; Stack slots written on this path:
;   [rsp+ 0] second half of the horizontal filters (spilled m10)
;   [rsp+32] x positions of the current strip (m14)
;   [rsp+64] low qword of the integer x offsets (no read visible in this span)
;   [rsp+72] remaining strips      [rsp+76] t0d (after the shift by 16)
;   [rsp+80] srcq                  [rsp+88] dstq / tmpq of the current strip
;   [rsp+96] vertical filter as eight sign-extended words
.dy1_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    ; ZF from `shr myd, 6`: a zero index keeps the lone-64 tap in r4
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pmaddwd              m8, [base+rescale_mul]
    movd               xm15, t0d
    mov            [rsp+76], t0d
    mov            [rsp+80], srcq
    mov            [rsp+88], r0q ; dstq / tmpq
%if UNIX64
    mov                  hm, hd ; reloaded into hd for every further strip
%endif
    shl           dword dxm, 3 ; dx*8
    vpbroadcastd        m15, xm15
    paddd               m14, m8 ; mx+dx*[0-7]
    movq                xm0, r4q
    pmovsxbw            xm0, xm0
    mova           [rsp+96], xm0
    jmp .dy1_hloop
; Advance to the next 8-pixel strip: step the destination by 8*(isprep+1)
; bytes, restore h and srcq, add dx*8 to the saved x positions and re-create
; the constants (m9 = 0, m10 = pd_0x3ff, m15 = t0) overwritten by the strip.
.dy1_hloop_prep:
    dec      dword [rsp+72]
    jz .ret
    add      qword [rsp+88], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp+32]
    vpbroadcastd        m15, [rsp+76]
    pxor                 m9, m9
    mov                srcq, [rsp+80]
    mov                 r0q, [rsp+88] ; dstq / tmpq
; Per-strip setup: pick the eight per-column horizontal filters, derive the
; per-column source offsets and filter the first eight rows.
.dy1_hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9 ; mask of columns whose filter index is zero
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu           [rsp+32], m14
    ; m15 <- filters of columns 0,1,4,5; m10 <- columns 2,3,6,7 (the top
    ; qword of each is filled by the vpblendd 0xc0 below)
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    movq           [rsp+64], xm14
    ; r4..rX now become the per-column byte offsets added to srcq
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    ; columns with a zero filter index take the m11 constant instead
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    movu              [rsp], m10 ; m10 is reused for vertical taps below
    ; broadcast the four tap pairs of the vertical filter stored at [rsp+96]
    vpbroadcastd         m8, [rsp+0x60]
    vpbroadcastd         m9, [rsp+0x64]
    vpbroadcastd        m10, [rsp+0x68]
    vpbroadcastd        m11, [rsp+0x6c]
    pshufb               m0, m14 ; 01a 01b
    pshufb               m1, m14 ; 23a 23b
    pshufb               m2, m14 ; 45a 45b
    pshufb               m3, m14 ; 67a 67b
    vbroadcasti128      m14, [base+wswap]
; One output row per iteration; exactly one new source row is fetched and
; horizontally filtered each time.
.dy1_vloop:
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pmaddwd              m6, m2, m10
    pmaddwd              m7, m3, m11
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy1_hloop_prep
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    add                srcq, ssq
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, [rsp] ; filters spilled from m10 above
    phaddw               m4, m5
    pslld                m5, m4, 16
    paddw                m4, m5 ; odd words now hold the full 8-tap sum
    pmulhrsw             m4, m12
    ; odd words of each register come from the next one: the row window
    ; moves down by one row
    pblendw              m0, m1, 0xaa
    pblendw              m1, m2, 0xaa
    pblendw              m2, m3, 0xaa
    pblendw              m3, m4, 0xaa
    jmp .dy1_vloop
; --- dy2: jump through the 16-bit offset table indexed by wq.
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
; --- dy2, w == 2 (put only): each loop iteration reads four source rows and
; writes two output rows.
.dy2_w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    vpbroadcastq         m2, [srcq+ssq*1]
    movhps              xm0, [srcq+ssq*2]
    vpbroadcastq         m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb              m14, m5
    paddb               m14, m6
    vpblendd            m15, m7, 0xaa
    pblendvb            m15, m11, m8
    movhps              xm1, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    ; ZF from `shr myd, 6`: a zero index keeps the lone-64 tap in r4
    cmovnz              r4q, [base+subpel_filters+myq*8]
    vpblendd             m0, m2, 0x30
    vpblendd             m1, m4, 0xc0
    vpblendd             m0, m3, 0xc0
    pshufb               m0, m14
    pshufb               m1, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    movq               xm11, r4q
    pmovsxbw           xm11, xm11
    phaddw               m0, m1
    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
    pshufd              xm8, xm11, q0000
    pshufd              xm9, xm11, q1111
    pshufd             xm10, xm11, q2222
    pshufd             xm11, xm11, q3333
    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 01 23
    punpckhwd           xm2, xm1            ; 23 45
.dy2_w2_loop:
    movq                xm6, [srcq+ssq*0]
    vpbroadcastq         m7, [srcq+ssq*1]
    movhps              xm6, [srcq+ssq*2]
    vpbroadcastq         m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd             xm4, xm3, xm8
    pmaddwd             xm5, xm2, xm9
    vpblendd             m6, m7, 0x30
    vpblendd             m6, m1, 0xc0
    pshufb               m6, m14
    pmaddubsw            m6, m15
    phaddw               m6, m6
    pmulhrsw             m6, m12
    palignr              m0, m6, m0, 8
    pshufd               m2, m0, q3221
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 45 67
    punpckhwd           xm2, xm1            ; 67 89
    pmaddwd             xm6, xm3, xm10
    pmaddwd             xm7, xm2, xm11
    paddd               xm4, xm5
    paddd               xm4, xm13
    paddd               xm6, xm7
    paddd               xm4, xm6
    psrad               xm4, rndshift
    packssdw            xm4, xm4
    packuswb            xm4, xm4
    pextrw     [dstq+dsq*0], xm4, 0
    pextrw     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
; --- dy2, w == 4 (continues below this span)
.dy2_w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
4628*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 4629*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m7 4630*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pd_0x4000] 4631*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm15, xm15 4632*c0909341SAndroid Build Coastguard Worker paddd m14, m8 ; mx+dx*[0-3] 4633*c0909341SAndroid Build Coastguard Worker pand m8, m14, m10 4634*c0909341SAndroid Build Coastguard Worker psrld m8, 6 4635*c0909341SAndroid Build Coastguard Worker paddd xm15, xm8 4636*c0909341SAndroid Build Coastguard Worker movd r4d, xm15 4637*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 1 4638*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm15, 2 4639*c0909341SAndroid Build Coastguard Worker pextrd r13d, xm15, 3 4640*c0909341SAndroid Build Coastguard Worker movd xm15, [base+subpel_filters+r4*8+2] 4641*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [base+bdct_lb_dw] 4642*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [base+subpel_s_shuf2] 4643*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r6*8+2], 1 4644*c0909341SAndroid Build Coastguard Worker pcmpeqd m8, m9 4645*c0909341SAndroid Build Coastguard Worker psrld m14, 10 4646*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*0] 4647*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*2] 4648*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r11*8+2], 2 4649*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*1] 4650*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ss3q ] 4651*c0909341SAndroid Build Coastguard Worker pinsrd xm15, [base+subpel_filters+r13*8+2], 3 4652*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 4653*c0909341SAndroid Build Coastguard Worker shr myd, 6 4654*c0909341SAndroid Build Coastguard Worker mov r4d, 64 << 24 4655*c0909341SAndroid Build Coastguard Worker lea myd, [t1+myq] 4656*c0909341SAndroid Build 
Coastguard Worker cmovnz r4q, [base+subpel_filters+myq*8] 4657*c0909341SAndroid Build Coastguard Worker vinserti128 m15, xm15, 1 4658*c0909341SAndroid Build Coastguard Worker pshufb m14, m5 4659*c0909341SAndroid Build Coastguard Worker paddb m14, m6 4660*c0909341SAndroid Build Coastguard Worker vinserti128 m2, [srcq+ssq*0], 1 4661*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+ssq*1], 1 4662*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 4663*c0909341SAndroid Build Coastguard Worker pblendvb m15, m11, m8 4664*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm14 4665*c0909341SAndroid Build Coastguard Worker pshufb m2, m14 4666*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm14 4667*c0909341SAndroid Build Coastguard Worker pshufb m3, m14 4668*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm15 4669*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m15 4670*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm15 4671*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m15 4672*c0909341SAndroid Build Coastguard Worker movq xm11, r4q 4673*c0909341SAndroid Build Coastguard Worker punpcklqdq xm11, xm11 4674*c0909341SAndroid Build Coastguard Worker pmovsxbw m11, xm11 4675*c0909341SAndroid Build Coastguard Worker phaddw m0, m2 4676*c0909341SAndroid Build Coastguard Worker phaddw m1, m3 4677*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m12 ; 0 2 _ 4 4678*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m12 ; 1 3 _ 5 4679*c0909341SAndroid Build Coastguard Worker pshufd m8, m11, q0000 4680*c0909341SAndroid Build Coastguard Worker pshufd m9, m11, q1111 4681*c0909341SAndroid Build Coastguard Worker pshufd m10, m11, q2222 4682*c0909341SAndroid Build Coastguard Worker pshufd m11, m11, q3333 4683*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm0, xm1 4684*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m1 ; 23 45 4685*c0909341SAndroid Build Coastguard Worker vinserti128 m0, m2, 
xm1, 1 ; 01 23 4686*c0909341SAndroid Build Coastguard Worker.dy2_w4_loop: 4687*c0909341SAndroid Build Coastguard Worker movu xm6, [srcq+ssq*0] 4688*c0909341SAndroid Build Coastguard Worker movu xm7, [srcq+ssq*1] 4689*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [srcq+ssq*2], 1 4690*c0909341SAndroid Build Coastguard Worker vinserti128 m7, [srcq+ss3q ], 1 4691*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 4692*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m0, m8 4693*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m1, m9 4694*c0909341SAndroid Build Coastguard Worker pshufb m6, m14 4695*c0909341SAndroid Build Coastguard Worker pshufb m7, m14 4696*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m15 4697*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, m15 4698*c0909341SAndroid Build Coastguard Worker psrld m2, m6, 16 4699*c0909341SAndroid Build Coastguard Worker pslld m3, m7, 16 4700*c0909341SAndroid Build Coastguard Worker paddw m6, m2 4701*c0909341SAndroid Build Coastguard Worker paddw m7, m3 4702*c0909341SAndroid Build Coastguard Worker pblendw m6, m7, 0xaa ; 67 89 4703*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m12 4704*c0909341SAndroid Build Coastguard Worker paddd m4, m5 4705*c0909341SAndroid Build Coastguard Worker vperm2i128 m0, m1, m6, 0x21 ; 45 67 4706*c0909341SAndroid Build Coastguard Worker mova m1, m6 4707*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m0, m10 4708*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m1, m11 4709*c0909341SAndroid Build Coastguard Worker paddd m4, m13 4710*c0909341SAndroid Build Coastguard Worker paddd m6, m7 4711*c0909341SAndroid Build Coastguard Worker paddd m4, m6 4712*c0909341SAndroid Build Coastguard Worker psrad m4, rndshift 4713*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 4714*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm5 4715*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 4716*c0909341SAndroid Build 
Coastguard Worker packuswb xm4, xm4 4717*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm4 4718*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm4, 1 4719*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 4720*c0909341SAndroid Build Coastguard Worker%else 4721*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm4 4722*c0909341SAndroid Build Coastguard Worker add tmpq, 16 4723*c0909341SAndroid Build Coastguard Worker%endif 4724*c0909341SAndroid Build Coastguard Worker sub hd, 2 4725*c0909341SAndroid Build Coastguard Worker jg .dy2_w4_loop 4726*c0909341SAndroid Build Coastguard Worker MC_8TAP_SCALED_RET 4727*c0909341SAndroid Build Coastguard Worker.dy2_w8: 4728*c0909341SAndroid Build Coastguard Worker mov dword [rsp+40], 1 4729*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 16 4730*c0909341SAndroid Build Coastguard Worker jmp .dy2_w_start 4731*c0909341SAndroid Build Coastguard Worker.dy2_w16: 4732*c0909341SAndroid Build Coastguard Worker mov dword [rsp+40], 2 4733*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 32 4734*c0909341SAndroid Build Coastguard Worker jmp .dy2_w_start 4735*c0909341SAndroid Build Coastguard Worker.dy2_w32: 4736*c0909341SAndroid Build Coastguard Worker mov dword [rsp+40], 4 4737*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 64 4738*c0909341SAndroid Build Coastguard Worker jmp .dy2_w_start 4739*c0909341SAndroid Build Coastguard Worker.dy2_w64: 4740*c0909341SAndroid Build Coastguard Worker mov dword [rsp+40], 8 4741*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 128 4742*c0909341SAndroid Build Coastguard Worker jmp .dy2_w_start 4743*c0909341SAndroid Build Coastguard Worker.dy2_w128: 4744*c0909341SAndroid Build Coastguard Worker mov dword [rsp+40], 16 4745*c0909341SAndroid Build Coastguard Worker movifprep tmp_stridem, 256 4746*c0909341SAndroid Build Coastguard Worker.dy2_w_start: 4747*c0909341SAndroid Build Coastguard Worker mov myd, 
mym 4748*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 4749*c0909341SAndroid Build Coastguard Worker movifnidn dsm, dsq 4750*c0909341SAndroid Build Coastguard Worker%endif 4751*c0909341SAndroid Build Coastguard Worker shr t0d, 16 4752*c0909341SAndroid Build Coastguard Worker sub srcq, 3 4753*c0909341SAndroid Build Coastguard Worker shr myd, 6 4754*c0909341SAndroid Build Coastguard Worker mov r4d, 64 << 24 4755*c0909341SAndroid Build Coastguard Worker lea myd, [t1+myq] 4756*c0909341SAndroid Build Coastguard Worker cmovnz r4q, [base+subpel_filters+myq*8] 4757*c0909341SAndroid Build Coastguard Worker pmaddwd m8, [base+rescale_mul] 4758*c0909341SAndroid Build Coastguard Worker movd xm15, t0d 4759*c0909341SAndroid Build Coastguard Worker mov [rsp+64], t0d 4760*c0909341SAndroid Build Coastguard Worker mov [rsp+48], srcq 4761*c0909341SAndroid Build Coastguard Worker mov [rsp+56], r0q ; dstq / tmpq 4762*c0909341SAndroid Build Coastguard Worker%if UNIX64 4763*c0909341SAndroid Build Coastguard Worker mov hm, hd 4764*c0909341SAndroid Build Coastguard Worker%endif 4765*c0909341SAndroid Build Coastguard Worker shl dword dxm, 3 ; dx*8 4766*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, xm15 4767*c0909341SAndroid Build Coastguard Worker paddd m14, m8 ; mx+dx*[0-7] 4768*c0909341SAndroid Build Coastguard Worker movq xm0, r4q 4769*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, xm0 4770*c0909341SAndroid Build Coastguard Worker mova [rsp+0x50], xm0 4771*c0909341SAndroid Build Coastguard Worker jmp .dy2_hloop 4772*c0909341SAndroid Build Coastguard Worker.dy2_hloop_prep: 4773*c0909341SAndroid Build Coastguard Worker dec dword [rsp+40] 4774*c0909341SAndroid Build Coastguard Worker jz .ret 4775*c0909341SAndroid Build Coastguard Worker add qword [rsp+56], 8*(isprep+1) 4776*c0909341SAndroid Build Coastguard Worker mov hd, hm 4777*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, dxm 4778*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, 
[base+pd_0x3ff] 4779*c0909341SAndroid Build Coastguard Worker paddd m14, m8, [rsp] 4780*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [rsp+64] 4781*c0909341SAndroid Build Coastguard Worker pxor m9, m9 4782*c0909341SAndroid Build Coastguard Worker mov srcq, [rsp+48] 4783*c0909341SAndroid Build Coastguard Worker mov r0q, [rsp+56] ; dstq / tmpq 4784*c0909341SAndroid Build Coastguard Worker.dy2_hloop: 4785*c0909341SAndroid Build Coastguard Worker vpbroadcastq m11, [base+pq_0x40000000] 4786*c0909341SAndroid Build Coastguard Worker pand m6, m14, m10 4787*c0909341SAndroid Build Coastguard Worker psrld m6, 6 4788*c0909341SAndroid Build Coastguard Worker paddd m15, m6 4789*c0909341SAndroid Build Coastguard Worker pcmpeqd m6, m9 4790*c0909341SAndroid Build Coastguard Worker vextracti128 xm7, m15, 1 4791*c0909341SAndroid Build Coastguard Worker movd r4d, xm15 4792*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm15, 2 4793*c0909341SAndroid Build Coastguard Worker pextrd r7d, xm15, 1 4794*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm15, 3 4795*c0909341SAndroid Build Coastguard Worker movd r10d, xm7 4796*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm7, 2 4797*c0909341SAndroid Build Coastguard Worker pextrd r13d, xm7, 1 4798*c0909341SAndroid Build Coastguard Worker pextrd rXd, xm7, 3 4799*c0909341SAndroid Build Coastguard Worker movu [rsp], m14 4800*c0909341SAndroid Build Coastguard Worker movq xm15, [base+subpel_filters+ r4*8] 4801*c0909341SAndroid Build Coastguard Worker movq xm10, [base+subpel_filters+ r6*8] 4802*c0909341SAndroid Build Coastguard Worker movhps xm15, [base+subpel_filters+ r7*8] 4803*c0909341SAndroid Build Coastguard Worker movhps xm10, [base+subpel_filters+ r9*8] 4804*c0909341SAndroid Build Coastguard Worker vinserti128 m15, [base+subpel_filters+r10*8], 1 4805*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [base+subpel_filters+r11*8], 1 4806*c0909341SAndroid Build Coastguard Worker vpbroadcastq m9, 
[base+subpel_filters+r13*8] 4807*c0909341SAndroid Build Coastguard Worker vpbroadcastq m8, [base+subpel_filters+ rX*8] 4808*c0909341SAndroid Build Coastguard Worker psrld m14, 10 4809*c0909341SAndroid Build Coastguard Worker vextracti128 xm7, m14, 1 4810*c0909341SAndroid Build Coastguard Worker movd r4d, xm14 4811*c0909341SAndroid Build Coastguard Worker pextrd r6d, xm14, 2 4812*c0909341SAndroid Build Coastguard Worker pextrd r7d, xm14, 1 4813*c0909341SAndroid Build Coastguard Worker pextrd r9d, xm14, 3 4814*c0909341SAndroid Build Coastguard Worker movd r10d, xm7 4815*c0909341SAndroid Build Coastguard Worker pextrd r11d, xm7, 2 4816*c0909341SAndroid Build Coastguard Worker pextrd r13d, xm7, 1 4817*c0909341SAndroid Build Coastguard Worker pextrd rXd, xm7, 3 4818*c0909341SAndroid Build Coastguard Worker pshufd m5, m6, q1100 4819*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q3322 4820*c0909341SAndroid Build Coastguard Worker vpblendd m15, m9, 0xc0 4821*c0909341SAndroid Build Coastguard Worker vpblendd m10, m8, 0xc0 4822*c0909341SAndroid Build Coastguard Worker pblendvb m15, m11, m5 4823*c0909341SAndroid Build Coastguard Worker pblendvb m10, m11, m6 4824*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m14, [base+subpel_s_shuf8] 4825*c0909341SAndroid Build Coastguard Worker MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 4826*c0909341SAndroid Build Coastguard Worker MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 4827*c0909341SAndroid Build Coastguard Worker MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 4828*c0909341SAndroid Build Coastguard Worker MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 4829*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [rsp+0x50] 4830*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [rsp+0x54] 4831*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [rsp+0x58] 4832*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [rsp+0x5c] 4833*c0909341SAndroid Build Coastguard 
Worker pshufb m0, m14 ; 01a 01b 4834*c0909341SAndroid Build Coastguard Worker pshufb m1, m14 ; 23a 23b 4835*c0909341SAndroid Build Coastguard Worker pshufb m2, m14 ; 45a 45b 4836*c0909341SAndroid Build Coastguard Worker pshufb m3, m14 ; 67a 67b 4837*c0909341SAndroid Build Coastguard Worker SWAP m14, m4 4838*c0909341SAndroid Build Coastguard Worker.dy2_vloop: 4839*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m0, m8 4840*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m1, m9 4841*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m2, m11 4842*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m3, m14 4843*c0909341SAndroid Build Coastguard Worker paddd m4, m5 4844*c0909341SAndroid Build Coastguard Worker paddd m6, m7 4845*c0909341SAndroid Build Coastguard Worker paddd m4, m13 4846*c0909341SAndroid Build Coastguard Worker paddd m4, m6 4847*c0909341SAndroid Build Coastguard Worker psrad m4, rndshift 4848*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 4849*c0909341SAndroid Build Coastguard Worker packssdw xm4, xm5 4850*c0909341SAndroid Build Coastguard Worker%ifidn %1, put 4851*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm4 4852*c0909341SAndroid Build Coastguard Worker movq [dstq], xm4 4853*c0909341SAndroid Build Coastguard Worker add dstq, dsm 4854*c0909341SAndroid Build Coastguard Worker%else 4855*c0909341SAndroid Build Coastguard Worker mova [tmpq], xm4 4856*c0909341SAndroid Build Coastguard Worker add tmpq, tmp_stridem 4857*c0909341SAndroid Build Coastguard Worker%endif 4858*c0909341SAndroid Build Coastguard Worker dec hd 4859*c0909341SAndroid Build Coastguard Worker jz .dy2_hloop_prep 4860*c0909341SAndroid Build Coastguard Worker mova m0, m1 4861*c0909341SAndroid Build Coastguard Worker mova m1, m2 4862*c0909341SAndroid Build Coastguard Worker mova m2, m3 4863*c0909341SAndroid Build Coastguard Worker movq xm3, [srcq+ r4] 4864*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+ r6] 4865*c0909341SAndroid Build 
Coastguard Worker movhps xm3, [srcq+ r7] 4866*c0909341SAndroid Build Coastguard Worker movhps xm4, [srcq+ r9] 4867*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [srcq+r10], 1 4868*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+r11], 1 4869*c0909341SAndroid Build Coastguard Worker vpbroadcastq m5, [srcq+r13] 4870*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [srcq+ rX] 4871*c0909341SAndroid Build Coastguard Worker add srcq, ssq 4872*c0909341SAndroid Build Coastguard Worker vpblendd m3, m5, 0xc0 4873*c0909341SAndroid Build Coastguard Worker vpblendd m4, m6, 0xc0 4874*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m15 4875*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m10 4876*c0909341SAndroid Build Coastguard Worker phaddw m3, m4 4877*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+ r4] 4878*c0909341SAndroid Build Coastguard Worker movq xm5, [srcq+ r6] 4879*c0909341SAndroid Build Coastguard Worker movhps xm4, [srcq+ r7] 4880*c0909341SAndroid Build Coastguard Worker movhps xm5, [srcq+ r9] 4881*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [srcq+r10], 1 4882*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [srcq+r11], 1 4883*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [srcq+r13] 4884*c0909341SAndroid Build Coastguard Worker vpbroadcastq m7, [srcq+ rX] 4885*c0909341SAndroid Build Coastguard Worker add srcq, ssq 4886*c0909341SAndroid Build Coastguard Worker vpblendd m4, m6, 0xc0 4887*c0909341SAndroid Build Coastguard Worker vpblendd m5, m7, 0xc0 4888*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m15 4889*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m10 4890*c0909341SAndroid Build Coastguard Worker phaddw m4, m5 4891*c0909341SAndroid Build Coastguard Worker psrld m5, m3, 16 4892*c0909341SAndroid Build Coastguard Worker pslld m6, m4, 16 4893*c0909341SAndroid Build Coastguard Worker paddw m3, m5 4894*c0909341SAndroid Build Coastguard Worker paddw m4, m6 
; ---------------------------------------------------------------------------
; Tail of the MC_8TAP_SCALED macro body. The macro header and everything that
; sets up m3/m4/m12 live above this span; only the formatting is restored here.
; ---------------------------------------------------------------------------
    pblendw         m3, m4, 0xaa
    pmulhrsw        m3, m12
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%undef isprep
%endmacro

; ---------------------------------------------------------------------------
; BILIN_SCALED_FN <put|prep>
;
; Emits the %1_bilin_scaled_8bpc entry point. The entry point loads the same
; constant, (5*15 << 16) | 5*15, into both t0d and t1d and then tail-jumps
; into the matching %1_8tap_scaled_8bpc function, so the bilinear variant
; shares the 8-tap scaled code path instead of having a body of its own.
;
; NOTE(review): t0/t1 appear to be the filter selectors consumed by the
; scaled macro (it reads t0b/t0d and t1); the meaning of the value 5*15 is
; defined by the filter tables outside this span -- confirm there.
; ---------------------------------------------------------------------------
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov             t0d, (5*15 << 16) | 5*15
    mov             t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro

; Temporary-register assignment for the put variants; the mapping differs
; between the Windows and non-Windows builds.
%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

; Both helpers expand to "FN <prefix>," so the arguments written at each use
; site become the remaining arguments of FN (FN is defined outside this span).
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,

; put: bilinear entry point, one entry point per filter combination, then the
; shared function body.
; NOTE(review): the final "regular" entry passes no target name and is
; immediately followed by the MC_8TAP_SCALED expansion -- presumably it falls
; through into that body; confirm against FN's definition.
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

; Temporary-register assignment for the prep variants.
%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; prep: same layout as the put list above.
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

; ---------------------------------------------------------------------------
; WARP_V dst, 02, 46, 13, 57
;
;   %1       index of the ymm register that receives the 32-bit sums
;   %2..%5   indices of the ymm registers holding the interleaved row pairs
;            named in the header (02, 46, 13, 57)
;
; Fetches eight 8-byte filter entries from filterq, indexed by
; (my + k*delta) >> 10 for k = 0..7 (a..d go to the low 128-bit lane, e..h to
; the high lane), transposes them so each register holds one coefficient pair
; per column, multiplies them against the four row-pair registers with
; pmaddwd and adds the four products into m%1.
;
; On exit my = my + 3*delta + gamma.
; Clobbers: tmp1, tmp2, m0, m8, m9, m%2 (used as an accumulator) and flags.
; m%3, m%4 and m%5 are only read.
; NOTE(review): the coefficient bytes are interleaved *above* the bytes of
; m11 (the existing "<< 8" annotations), which only yields coeff << 8 if m11
; is zero on entry -- confirm the caller guarantees that.
; ---------------------------------------------------------------------------
%macro WARP_V 5 ; dst, 02, 46, 13, 57
    ; Can be done using gathers, but that's terribly slow on many CPUs
    lea             tmp1d, [myq+deltaq*4]
    lea             tmp2d, [myq+deltaq*1]
    shr             myd, 10
    shr             tmp1d, 10
    movq            xm8, [filterq+myq*8]
    vinserti128     m8, [filterq+tmp1q*8], 1    ; a e
    lea             tmp1d, [tmp2q+deltaq*4]
    lea             myd, [tmp2q+deltaq*1]
    shr             tmp2d, 10
    shr             tmp1d, 10
    movq            xm0, [filterq+tmp2q*8]
    vinserti128     m0, [filterq+tmp1q*8], 1    ; b f
    lea             tmp1d, [myq+deltaq*4]
    lea             tmp2d, [myq+deltaq*1]
    shr             myd, 10
    shr             tmp1d, 10
    movq            xm9, [filterq+myq*8]
    vinserti128     m9, [filterq+tmp1q*8], 1    ; c g
    lea             tmp1d, [tmp2q+deltaq*4]
    lea             myd, [tmp2q+gammaq]         ; my += gamma
    shr             tmp2d, 10
    shr             tmp1d, 10
    punpcklwd       m8, m0
    movq            xm0, [filterq+tmp2q*8]
    vinserti128     m0, [filterq+tmp1q*8], 1    ; d h
    punpcklwd       m0, m9, m0
    punpckldq       m9, m8, m0
    punpckhdq       m0, m8, m0
    punpcklbw       m8, m11, m9                 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw       m9, m11, m9                 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    pmaddwd         m%2, m8
    pmaddwd         m9, m%3
    punpcklbw       m8, m11, m0                 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw       m0, m11, m0                 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd         m8, m%4
    pmaddwd         m0, m%5
    paddd           m%2, m9
    paddd           m0, m8
    paddd           m%1, m0, m%2
%endmacro

; ---------------------------------------------------------------------------
; warp_affine_8x8t_8bpc(tmp, ts, ...)
;
; Variant of the 8x8 warp that writes 16-bit intermediates to tmp instead of
; 8-bit pixels. All filtering is delegated to the .main / .main2 helpers of
; warp_affine_8x8_8bpc_avx2; each helper call leaves two rows of 32-bit sums
; in m7 and m0. Per iteration this loop shifts both right by 13, packs them to
; signed words, rounds with pmulhrsw against m14, and stores one row at
; [tmp + ts*0] and the other at [tmp + ts*2], then advances tmp by ts*4.
; r4d is the iteration counter (set up inside .main); when it reaches zero
; control transfers to the sibling function's .end label.
; NOTE(review): ts is scaled by 2 and 4 here, so it looks like a stride in
; 16-bit elements rather than bytes -- confirm against the C prototype.
; NOTE(review): the WIN64 "sub rsp, 0xa0" matches the stack_size_padded value
; used by warp_affine_8x8_8bpc, presumably so the shared .end epilogue unwinds
; correctly -- confirm.
; ---------------------------------------------------------------------------
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
%if WIN64
    sub             rsp, 0xa0
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
.loop:
    psrad           m7, 13
    psrad           m0, 13
    packssdw        m7, m0
    pmulhrsw        m7, m14                     ; (x + (1 << 6)) >> 7
    vpermq          m7, m7, q3120               ; undo the per-lane interleave of packssdw
    mova            [tmpq+tsq*0], xm7
    vextracti128    [tmpq+tsq*2], m7, 1
    dec             r4d
    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
    lea             tmpq, [tmpq+tsq*4]
    jmp .loop

; ---------------------------------------------------------------------------
; Head of warp_affine_8x8_8bpc. The definition continues past this span
; (.end, .main, .main2 and .h follow); only the formatting is restored here.
; ---------------------------------------------------------------------------
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
                                        beta, filter, tmp1, delta, my, gamma
%if WIN64
    %assign xmm_regs_used 16
    %assign stack_size_padded 0xa0
    SUB             rsp, stack_size_padded
%endif
    call .main
    jmp .start
.loop:
    call .main2
    lea             dstq, [dstq+dsq*2]
.start:
    psrad           m7, 18
    psrad           m0, 18
    packusdw        m7, m0
    pavgw           m7, m11                     ; (x + (1 << 10)) >> 11
    vextracti128    xm0, m7, 1
    packuswb        xm7, xm0
5030*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm7, q3120 5031*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm7 5032*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm7 5033*c0909341SAndroid Build Coastguard Worker dec r4d 5034*c0909341SAndroid Build Coastguard Worker jg .loop 5035*c0909341SAndroid Build Coastguard Worker.end: 5036*c0909341SAndroid Build Coastguard Worker RET 5037*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5038*c0909341SAndroid Build Coastguard Worker.main: 5039*c0909341SAndroid Build Coastguard Worker ; Stack is offset due to call 5040*c0909341SAndroid Build Coastguard Worker %assign stack_offset stack_offset + gprsize 5041*c0909341SAndroid Build Coastguard Worker %assign stack_size stack_size + gprsize 5042*c0909341SAndroid Build Coastguard Worker %assign stack_size_padded stack_size_padded + gprsize 5043*c0909341SAndroid Build Coastguard Worker movifnidn abcdq, abcdmp 5044*c0909341SAndroid Build Coastguard Worker movifnidn mxd, mxm 5045*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 5046*c0909341SAndroid Build Coastguard Worker movsx alphad, word [abcdq+2*0] 5047*c0909341SAndroid Build Coastguard Worker movsx betad, word [abcdq+2*1] 5048*c0909341SAndroid Build Coastguard Worker mova m12, [warp_8x8_shufA] 5049*c0909341SAndroid Build Coastguard Worker mova m13, [warp_8x8_shufB] 5050*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pw_8192] 5051*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [pd_32768] 5052*c0909341SAndroid Build Coastguard Worker pxor m11, m11 5053*c0909341SAndroid Build Coastguard Worker lea filterq, [mc_warp_filter2] 5054*c0909341SAndroid Build Coastguard Worker lea tmp1q, [ssq*3+3] 5055*c0909341SAndroid Build Coastguard Worker add mxd, 512+(64<<10) 5056*c0909341SAndroid Build Coastguard Worker lea tmp2d, [alphaq*3] 5057*c0909341SAndroid Build Coastguard Worker sub srcq, tmp1q ; src -= src_stride*3 + 3 5058*c0909341SAndroid Build Coastguard 
Worker sub betad, tmp2d ; beta -= alpha*3 5059*c0909341SAndroid Build Coastguard Worker mov myd, r6m 5060*c0909341SAndroid Build Coastguard Worker call .h 5061*c0909341SAndroid Build Coastguard Worker psrld m1, m0, 16 5062*c0909341SAndroid Build Coastguard Worker call .h 5063*c0909341SAndroid Build Coastguard Worker psrld m4, m0, 16 5064*c0909341SAndroid Build Coastguard Worker call .h 5065*c0909341SAndroid Build Coastguard Worker pblendw m1, m0, 0xaa ; 02 5066*c0909341SAndroid Build Coastguard Worker call .h 5067*c0909341SAndroid Build Coastguard Worker pblendw m4, m0, 0xaa ; 13 5068*c0909341SAndroid Build Coastguard Worker call .h 5069*c0909341SAndroid Build Coastguard Worker psrld m2, m1, 16 5070*c0909341SAndroid Build Coastguard Worker pblendw m2, m0, 0xaa ; 24 5071*c0909341SAndroid Build Coastguard Worker call .h 5072*c0909341SAndroid Build Coastguard Worker psrld m5, m4, 16 5073*c0909341SAndroid Build Coastguard Worker pblendw m5, m0, 0xaa ; 35 5074*c0909341SAndroid Build Coastguard Worker call .h 5075*c0909341SAndroid Build Coastguard Worker psrld m3, m2, 16 5076*c0909341SAndroid Build Coastguard Worker pblendw m3, m0, 0xaa ; 46 5077*c0909341SAndroid Build Coastguard Worker movsx deltad, word [abcdq+2*2] 5078*c0909341SAndroid Build Coastguard Worker movsx gammad, word [abcdq+2*3] 5079*c0909341SAndroid Build Coastguard Worker add myd, 512+(64<<10) 5080*c0909341SAndroid Build Coastguard Worker mov r4d, 4 5081*c0909341SAndroid Build Coastguard Worker lea tmp1d, [deltaq*3] 5082*c0909341SAndroid Build Coastguard Worker sub gammad, tmp1d ; gamma -= delta*3 5083*c0909341SAndroid Build Coastguard Worker.main2: 5084*c0909341SAndroid Build Coastguard Worker call .h 5085*c0909341SAndroid Build Coastguard Worker psrld m6, m5, 16 5086*c0909341SAndroid Build Coastguard Worker pblendw m6, m0, 0xaa ; 57 5087*c0909341SAndroid Build Coastguard Worker WARP_V 7, 1, 3, 4, 6 5088*c0909341SAndroid Build Coastguard Worker call .h 5089*c0909341SAndroid Build Coastguard Worker mova 
m1, m2 5090*c0909341SAndroid Build Coastguard Worker mova m2, m3 5091*c0909341SAndroid Build Coastguard Worker psrld m3, 16 5092*c0909341SAndroid Build Coastguard Worker pblendw m3, m0, 0xaa ; 68 5093*c0909341SAndroid Build Coastguard Worker WARP_V 0, 4, 6, 1, 3 5094*c0909341SAndroid Build Coastguard Worker mova m4, m5 5095*c0909341SAndroid Build Coastguard Worker mova m5, m6 5096*c0909341SAndroid Build Coastguard Worker ret 5097*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5098*c0909341SAndroid Build Coastguard Worker.h: 5099*c0909341SAndroid Build Coastguard Worker lea tmp1d, [mxq+alphaq*4] 5100*c0909341SAndroid Build Coastguard Worker lea tmp2d, [mxq+alphaq*1] 5101*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [srcq] 5102*c0909341SAndroid Build Coastguard Worker shr mxd, 10 5103*c0909341SAndroid Build Coastguard Worker shr tmp1d, 10 5104*c0909341SAndroid Build Coastguard Worker movq xm8, [filterq+mxq *8] 5105*c0909341SAndroid Build Coastguard Worker vinserti128 m8, [filterq+tmp1q*8], 1 5106*c0909341SAndroid Build Coastguard Worker lea tmp1d, [tmp2q+alphaq*4] 5107*c0909341SAndroid Build Coastguard Worker lea mxd, [tmp2q+alphaq*1] 5108*c0909341SAndroid Build Coastguard Worker shr tmp2d, 10 5109*c0909341SAndroid Build Coastguard Worker shr tmp1d, 10 5110*c0909341SAndroid Build Coastguard Worker movq xm0, [filterq+tmp2q*8] 5111*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [filterq+tmp1q*8], 1 5112*c0909341SAndroid Build Coastguard Worker lea tmp1d, [mxq+alphaq*4] 5113*c0909341SAndroid Build Coastguard Worker lea tmp2d, [mxq+alphaq*1] 5114*c0909341SAndroid Build Coastguard Worker shr mxd, 10 5115*c0909341SAndroid Build Coastguard Worker shr tmp1d, 10 5116*c0909341SAndroid Build Coastguard Worker movq xm9, [filterq+mxq *8] 5117*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [filterq+tmp1q*8], 1 5118*c0909341SAndroid Build Coastguard Worker lea tmp1d, [tmp2q+alphaq*4] 5119*c0909341SAndroid Build Coastguard Worker lea 
mxd, [tmp2q+betaq] ; mx += beta 5120*c0909341SAndroid Build Coastguard Worker shr tmp2d, 10 5121*c0909341SAndroid Build Coastguard Worker shr tmp1d, 10 5122*c0909341SAndroid Build Coastguard Worker punpcklqdq m8, m0 ; 0 1 4 5 5123*c0909341SAndroid Build Coastguard Worker movq xm0, [filterq+tmp2q*8] 5124*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [filterq+tmp1q*8], 1 5125*c0909341SAndroid Build Coastguard Worker punpcklqdq m9, m0 ; 2 3 6 7 5126*c0909341SAndroid Build Coastguard Worker pshufb m0, m10, m12 5127*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m8 5128*c0909341SAndroid Build Coastguard Worker pshufb m10, m13 5129*c0909341SAndroid Build Coastguard Worker pmaddubsw m10, m9 5130*c0909341SAndroid Build Coastguard Worker add srcq, ssq 5131*c0909341SAndroid Build Coastguard Worker phaddw m0, m10 5132*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 5133*c0909341SAndroid Build Coastguard Worker paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword 5134*c0909341SAndroid Build Coastguard Worker ret 5135*c0909341SAndroid Build Coastguard Worker 5136*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 1 ; op 5137*c0909341SAndroid Build Coastguard Worker %1 0 5138*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 5139*c0909341SAndroid Build Coastguard Worker jmp wq 5140*c0909341SAndroid Build Coastguard Worker.w4: 5141*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 5142*c0909341SAndroid Build Coastguard Worker movd [dstq ], xm0 5143*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 5144*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 5145*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 5146*c0909341SAndroid Build Coastguard Worker cmp hd, 4 5147*c0909341SAndroid Build Coastguard Worker je .ret 5148*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 
5149*c0909341SAndroid Build Coastguard Worker pextrd [dstq ], xm0, 2 5150*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 5151*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 5152*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 5153*c0909341SAndroid Build Coastguard Worker cmp hd, 8 5154*c0909341SAndroid Build Coastguard Worker je .ret 5155*c0909341SAndroid Build Coastguard Worker %1 2 5156*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5157*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 5158*c0909341SAndroid Build Coastguard Worker movd [dstq ], xm0 5159*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 5160*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 5161*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 5162*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5163*c0909341SAndroid Build Coastguard Worker pextrd [dstq ], xm0, 2 5164*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 5165*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 5166*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 5167*c0909341SAndroid Build Coastguard Worker.ret: 5168*c0909341SAndroid Build Coastguard Worker RET 5169*c0909341SAndroid Build Coastguard Worker.w8_loop: 5170*c0909341SAndroid Build Coastguard Worker %1_INC_PTR 2 5171*c0909341SAndroid Build Coastguard Worker %1 0 5172*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5173*c0909341SAndroid Build Coastguard Worker.w8: 5174*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 5175*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm0 5176*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 5177*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 5178*c0909341SAndroid Build Coastguard Worker movhps 
[dstq+stride3q ], xm1 5179*c0909341SAndroid Build Coastguard Worker sub hd, 4 5180*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5181*c0909341SAndroid Build Coastguard Worker RET 5182*c0909341SAndroid Build Coastguard Worker.w16_loop: 5183*c0909341SAndroid Build Coastguard Worker %1_INC_PTR 4 5184*c0909341SAndroid Build Coastguard Worker %1 0 5185*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5186*c0909341SAndroid Build Coastguard Worker.w16: 5187*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5188*c0909341SAndroid Build Coastguard Worker mova [dstq ], xm0 5189*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 5190*c0909341SAndroid Build Coastguard Worker %1 2 5191*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5192*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm0 5193*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m0, 1 5194*c0909341SAndroid Build Coastguard Worker sub hd, 4 5195*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5196*c0909341SAndroid Build Coastguard Worker RET 5197*c0909341SAndroid Build Coastguard Worker.w32_loop: 5198*c0909341SAndroid Build Coastguard Worker %1_INC_PTR 4 5199*c0909341SAndroid Build Coastguard Worker %1 0 5200*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5201*c0909341SAndroid Build Coastguard Worker.w32: 5202*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5203*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 5204*c0909341SAndroid Build Coastguard Worker %1 2 5205*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5206*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 5207*c0909341SAndroid Build Coastguard Worker sub hd, 2 5208*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5209*c0909341SAndroid Build Coastguard Worker RET 5210*c0909341SAndroid Build Coastguard Worker.w64_loop: 5211*c0909341SAndroid Build 
Coastguard Worker %1_INC_PTR 4 5212*c0909341SAndroid Build Coastguard Worker %1 0 5213*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5214*c0909341SAndroid Build Coastguard Worker.w64: 5215*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5216*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 5217*c0909341SAndroid Build Coastguard Worker %1 2 5218*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5219*c0909341SAndroid Build Coastguard Worker mova [dstq+32], m0 5220*c0909341SAndroid Build Coastguard Worker dec hd 5221*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5222*c0909341SAndroid Build Coastguard Worker RET 5223*c0909341SAndroid Build Coastguard Worker.w128_loop: 5224*c0909341SAndroid Build Coastguard Worker %1 0 5225*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5226*c0909341SAndroid Build Coastguard Worker.w128: 5227*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5228*c0909341SAndroid Build Coastguard Worker mova [dstq+0*32], m0 5229*c0909341SAndroid Build Coastguard Worker %1 2 5230*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5231*c0909341SAndroid Build Coastguard Worker mova [dstq+1*32], m0 5232*c0909341SAndroid Build Coastguard Worker %1_INC_PTR 8 5233*c0909341SAndroid Build Coastguard Worker %1 -4 5234*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5235*c0909341SAndroid Build Coastguard Worker mova [dstq+2*32], m0 5236*c0909341SAndroid Build Coastguard Worker %1 -2 5237*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 5238*c0909341SAndroid Build Coastguard Worker mova [dstq+3*32], m0 5239*c0909341SAndroid Build Coastguard Worker dec hd 5240*c0909341SAndroid Build Coastguard Worker jg .w128_loop 5241*c0909341SAndroid Build Coastguard Worker RET 5242*c0909341SAndroid Build Coastguard Worker%endmacro 5243*c0909341SAndroid Build Coastguard Worker 5244*c0909341SAndroid Build Coastguard Worker%macro AVG 1 ; src_offset 
5245*c0909341SAndroid Build Coastguard Worker mova m0, [tmp1q+(%1+0)*32] 5246*c0909341SAndroid Build Coastguard Worker paddw m0, [tmp2q+(%1+0)*32] 5247*c0909341SAndroid Build Coastguard Worker mova m1, [tmp1q+(%1+1)*32] 5248*c0909341SAndroid Build Coastguard Worker paddw m1, [tmp2q+(%1+1)*32] 5249*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m2 5250*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 5251*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5252*c0909341SAndroid Build Coastguard Worker%endmacro 5253*c0909341SAndroid Build Coastguard Worker 5254*c0909341SAndroid Build Coastguard Worker%macro AVG_INC_PTR 1 5255*c0909341SAndroid Build Coastguard Worker add tmp1q, %1*32 5256*c0909341SAndroid Build Coastguard Worker add tmp2q, %1*32 5257*c0909341SAndroid Build Coastguard Worker%endmacro 5258*c0909341SAndroid Build Coastguard Worker 5259*c0909341SAndroid Build Coastguard Workercglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 5260*c0909341SAndroid Build Coastguard Worker%define base r6-avg %+ SUFFIX %+ _table 5261*c0909341SAndroid Build Coastguard Worker lea r6, [avg %+ SUFFIX %+ _table] 5262*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5263*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5264*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r6+wq*4] 5265*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pw_1024] 5266*c0909341SAndroid Build Coastguard Worker add wq, r6 5267*c0909341SAndroid Build Coastguard Worker BIDIR_FN AVG 5268*c0909341SAndroid Build Coastguard Worker 5269*c0909341SAndroid Build Coastguard Worker%macro W_AVG 1 ; src_offset 5270*c0909341SAndroid Build Coastguard Worker ; (a * weight + b * (16 - weight) + 128) >> 8 5271*c0909341SAndroid Build Coastguard Worker ; = ((a - b) * weight + (b << 4) + 128) >> 8 5272*c0909341SAndroid Build Coastguard Worker ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 5273*c0909341SAndroid Build Coastguard Worker ; = 
((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 5274*c0909341SAndroid Build Coastguard Worker mova m0, [tmp1q+(%1+0)*32] 5275*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmp2q+(%1+0)*32] 5276*c0909341SAndroid Build Coastguard Worker mova m1, [tmp1q+(%1+1)*32] 5277*c0909341SAndroid Build Coastguard Worker psubw m3, m1, [tmp2q+(%1+1)*32] 5278*c0909341SAndroid Build Coastguard Worker pmulhw m2, m4 5279*c0909341SAndroid Build Coastguard Worker pmulhw m3, m4 5280*c0909341SAndroid Build Coastguard Worker paddw m0, m2 5281*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5282*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5283*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5284*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5285*c0909341SAndroid Build Coastguard Worker%endmacro 5286*c0909341SAndroid Build Coastguard Worker 5287*c0909341SAndroid Build Coastguard Worker%define W_AVG_INC_PTR AVG_INC_PTR 5288*c0909341SAndroid Build Coastguard Worker 5289*c0909341SAndroid Build Coastguard Workercglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 5290*c0909341SAndroid Build Coastguard Worker%define base r6-w_avg %+ SUFFIX %+ _table 5291*c0909341SAndroid Build Coastguard Worker lea r6, [w_avg %+ SUFFIX %+ _table] 5292*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5293*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5294*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, r6m ; weight 5295*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r6+wq*4] 5296*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_2048] 5297*c0909341SAndroid Build Coastguard Worker psllw m4, 12 ; (weight-16) << 12 when interpreted as signed 5298*c0909341SAndroid Build Coastguard Worker add wq, r6 5299*c0909341SAndroid Build Coastguard Worker cmp dword r6m, 7 5300*c0909341SAndroid Build Coastguard Worker jg .weight_gt7 5301*c0909341SAndroid Build Coastguard Worker mov r6, tmp1q 
5302*c0909341SAndroid Build Coastguard Worker pxor m0, m0 5303*c0909341SAndroid Build Coastguard Worker mov tmp1q, tmp2q 5304*c0909341SAndroid Build Coastguard Worker psubw m4, m0, m4 ; -weight 5305*c0909341SAndroid Build Coastguard Worker mov tmp2q, r6 5306*c0909341SAndroid Build Coastguard Worker.weight_gt7: 5307*c0909341SAndroid Build Coastguard Worker BIDIR_FN W_AVG 5308*c0909341SAndroid Build Coastguard Worker 5309*c0909341SAndroid Build Coastguard Worker%macro MASK 1 ; src_offset 5310*c0909341SAndroid Build Coastguard Worker ; (a * m + b * (64 - m) + 512) >> 10 5311*c0909341SAndroid Build Coastguard Worker ; = ((a - b) * m + (b << 6) + 512) >> 10 5312*c0909341SAndroid Build Coastguard Worker ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 5313*c0909341SAndroid Build Coastguard Worker vpermq m3, [maskq+%1*16], q3120 5314*c0909341SAndroid Build Coastguard Worker mova m0, [tmp2q+(%1+0)*32] 5315*c0909341SAndroid Build Coastguard Worker psubw m1, m0, [tmp1q+(%1+0)*32] 5316*c0909341SAndroid Build Coastguard Worker psubb m3, m4, m3 5317*c0909341SAndroid Build Coastguard Worker paddw m1, m1 ; (b - a) << 1 5318*c0909341SAndroid Build Coastguard Worker paddb m3, m3 5319*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4, m3 ; -m << 9 5320*c0909341SAndroid Build Coastguard Worker pmulhw m1, m2 5321*c0909341SAndroid Build Coastguard Worker paddw m0, m1 5322*c0909341SAndroid Build Coastguard Worker mova m1, [tmp2q+(%1+1)*32] 5323*c0909341SAndroid Build Coastguard Worker psubw m2, m1, [tmp1q+(%1+1)*32] 5324*c0909341SAndroid Build Coastguard Worker paddw m2, m2 5325*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4, m3 5326*c0909341SAndroid Build Coastguard Worker pmulhw m2, m3 5327*c0909341SAndroid Build Coastguard Worker paddw m1, m2 5328*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5329*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5330*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5331*c0909341SAndroid Build Coastguard 
Worker%endmacro 5332*c0909341SAndroid Build Coastguard Worker 5333*c0909341SAndroid Build Coastguard Worker%macro MASK_INC_PTR 1 5334*c0909341SAndroid Build Coastguard Worker add maskq, %1*16 5335*c0909341SAndroid Build Coastguard Worker add tmp2q, %1*32 5336*c0909341SAndroid Build Coastguard Worker add tmp1q, %1*32 5337*c0909341SAndroid Build Coastguard Worker%endmacro 5338*c0909341SAndroid Build Coastguard Worker 5339*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 5340*c0909341SAndroid Build Coastguard Worker%define base r7-mask %+ SUFFIX %+ _table 5341*c0909341SAndroid Build Coastguard Worker lea r7, [mask %+ SUFFIX %+ _table] 5342*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5343*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5344*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 5345*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4] 5346*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_2048] 5347*c0909341SAndroid Build Coastguard Worker pxor m4, m4 5348*c0909341SAndroid Build Coastguard Worker add wq, r7 5349*c0909341SAndroid Build Coastguard Worker BIDIR_FN MASK 5350*c0909341SAndroid Build Coastguard Worker 5351*c0909341SAndroid Build Coastguard Worker%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 5352*c0909341SAndroid Build Coastguard Worker mova m%1, [tmp1q+32*%3] 5353*c0909341SAndroid Build Coastguard Worker mova m1, [tmp2q+32*%3] 5354*c0909341SAndroid Build Coastguard Worker psubw m1, m%1 5355*c0909341SAndroid Build Coastguard Worker pabsw m%2, m1 5356*c0909341SAndroid Build Coastguard Worker psubusw m%2, m6, m%2 5357*c0909341SAndroid Build Coastguard Worker psrlw m%2, 8 ; 64 - m 5358*c0909341SAndroid Build Coastguard Worker psllw m2, m%2, 10 5359*c0909341SAndroid Build Coastguard Worker pmulhw m1, m2 5360*c0909341SAndroid Build Coastguard Worker paddw m%1, m1 5361*c0909341SAndroid Build Coastguard Worker mova m1, 
[tmp1q+32*%4] 5362*c0909341SAndroid Build Coastguard Worker mova m2, [tmp2q+32*%4] 5363*c0909341SAndroid Build Coastguard Worker psubw m2, m1 5364*c0909341SAndroid Build Coastguard Worker pabsw m3, m2 5365*c0909341SAndroid Build Coastguard Worker psubusw m3, m6, m3 5366*c0909341SAndroid Build Coastguard Worker psrlw m3, 8 5367*c0909341SAndroid Build Coastguard Worker%if %5 5368*c0909341SAndroid Build Coastguard Worker packuswb m%2, m3 5369*c0909341SAndroid Build Coastguard Worker psubb m%2, m5, m%2 5370*c0909341SAndroid Build Coastguard Worker vpermq m%2, m%2, q3120 5371*c0909341SAndroid Build Coastguard Worker%else 5372*c0909341SAndroid Build Coastguard Worker phaddw m%2, m3 5373*c0909341SAndroid Build Coastguard Worker%endif 5374*c0909341SAndroid Build Coastguard Worker psllw m3, 10 5375*c0909341SAndroid Build Coastguard Worker pmulhw m2, m3 5376*c0909341SAndroid Build Coastguard Worker paddw m1, m2 5377*c0909341SAndroid Build Coastguard Worker pmulhrsw m%1, m7 5378*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 5379*c0909341SAndroid Build Coastguard Worker packuswb m%1, m1 5380*c0909341SAndroid Build Coastguard Worker%endmacro 5381*c0909341SAndroid Build Coastguard Worker 5382*c0909341SAndroid Build Coastguard Workercglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask 5383*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx2_table 5384*c0909341SAndroid Build Coastguard Worker lea r6, [blend_avx2_table] 5385*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5386*c0909341SAndroid Build Coastguard Worker movifnidn maskq, maskmp 5387*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5388*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r6+wq*4] 5389*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+pb_64] 5390*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_512] 5391*c0909341SAndroid Build Coastguard Worker sub tmpq, maskq 5392*c0909341SAndroid Build Coastguard Worker add wq, r6 
5393*c0909341SAndroid Build Coastguard Worker lea r6, [dsq*3] 5394*c0909341SAndroid Build Coastguard Worker jmp wq 5395*c0909341SAndroid Build Coastguard Worker.w4: 5396*c0909341SAndroid Build Coastguard Worker movd xm0, [dstq+dsq*0] 5397*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [dstq+dsq*1], 1 5398*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm1, [dstq+dsq*2] 5399*c0909341SAndroid Build Coastguard Worker pinsrd xm1, [dstq+r6 ], 3 5400*c0909341SAndroid Build Coastguard Worker mova xm6, [maskq] 5401*c0909341SAndroid Build Coastguard Worker psubb xm3, xm4, xm6 5402*c0909341SAndroid Build Coastguard Worker punpcklbw xm2, xm3, xm6 5403*c0909341SAndroid Build Coastguard Worker punpckhbw xm3, xm6 5404*c0909341SAndroid Build Coastguard Worker mova xm6, [maskq+tmpq] 5405*c0909341SAndroid Build Coastguard Worker add maskq, 4*4 5406*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm6 5407*c0909341SAndroid Build Coastguard Worker punpckhbw xm1, xm6 5408*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm2 5409*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm3 5410*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm5 5411*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm5 5412*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 5413*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm0 5414*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm0, 1 5415*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*2], xm0, 2 5416*c0909341SAndroid Build Coastguard Worker pextrd [dstq+r6 ], xm0, 3 5417*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5418*c0909341SAndroid Build Coastguard Worker sub hd, 4 5419*c0909341SAndroid Build Coastguard Worker jg .w4 5420*c0909341SAndroid Build Coastguard Worker RET 5421*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5422*c0909341SAndroid Build Coastguard Worker.w8: 5423*c0909341SAndroid Build Coastguard Worker movq xm1, 
[dstq+dsq*0] 5424*c0909341SAndroid Build Coastguard Worker movhps xm1, [dstq+dsq*1] 5425*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [dstq+dsq*2] 5426*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [dstq+r6 ] 5427*c0909341SAndroid Build Coastguard Worker mova m0, [maskq] 5428*c0909341SAndroid Build Coastguard Worker mova m6, [maskq+tmpq] 5429*c0909341SAndroid Build Coastguard Worker add maskq, 8*4 5430*c0909341SAndroid Build Coastguard Worker vpblendd m1, m2, 0x30 5431*c0909341SAndroid Build Coastguard Worker vpblendd m1, m3, 0xc0 5432*c0909341SAndroid Build Coastguard Worker psubb m3, m4, m0 5433*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m0 5434*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0 5435*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m6 5436*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m6 5437*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 5438*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5439*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5440*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5441*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5442*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 5443*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 5444*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm0 5445*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*2], xm1 5446*c0909341SAndroid Build Coastguard Worker movhps [dstq+r6 ], xm1 5447*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5448*c0909341SAndroid Build Coastguard Worker sub hd, 4 5449*c0909341SAndroid Build Coastguard Worker jg .w8 5450*c0909341SAndroid Build Coastguard Worker RET 5451*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5452*c0909341SAndroid Build Coastguard Worker.w16: 5453*c0909341SAndroid Build Coastguard Worker mova m0, [maskq] 5454*c0909341SAndroid Build Coastguard Worker 
mova xm1, [dstq+dsq*0] 5455*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [dstq+dsq*1], 1 5456*c0909341SAndroid Build Coastguard Worker psubb m3, m4, m0 5457*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m0 5458*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0 5459*c0909341SAndroid Build Coastguard Worker mova m6, [maskq+tmpq] 5460*c0909341SAndroid Build Coastguard Worker add maskq, 16*2 5461*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m6 5462*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m6 5463*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 5464*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5465*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5466*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5467*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5468*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5469*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], m0, 1 5470*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5471*c0909341SAndroid Build Coastguard Worker sub hd, 2 5472*c0909341SAndroid Build Coastguard Worker jg .w16 5473*c0909341SAndroid Build Coastguard Worker RET 5474*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5475*c0909341SAndroid Build Coastguard Worker.w32: 5476*c0909341SAndroid Build Coastguard Worker mova m0, [maskq] 5477*c0909341SAndroid Build Coastguard Worker mova m1, [dstq] 5478*c0909341SAndroid Build Coastguard Worker mova m6, [maskq+tmpq] 5479*c0909341SAndroid Build Coastguard Worker add maskq, 32 5480*c0909341SAndroid Build Coastguard Worker psubb m3, m4, m0 5481*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m0 5482*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0 5483*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m6 5484*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m6 5485*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 
5486*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5487*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5488*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5489*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5490*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 5491*c0909341SAndroid Build Coastguard Worker add dstq, dsq 5492*c0909341SAndroid Build Coastguard Worker dec hd 5493*c0909341SAndroid Build Coastguard Worker jg .w32 5494*c0909341SAndroid Build Coastguard Worker RET 5495*c0909341SAndroid Build Coastguard Worker 5496*c0909341SAndroid Build Coastguard Workercglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask 5497*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_avx2_table 5498*c0909341SAndroid Build Coastguard Worker lea r5, [blend_v_avx2_table] 5499*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5500*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5501*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r5+wq*4] 5502*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_512] 5503*c0909341SAndroid Build Coastguard Worker add wq, r5 5504*c0909341SAndroid Build Coastguard Worker add maskq, obmc_masks-blend_v_avx2_table 5505*c0909341SAndroid Build Coastguard Worker jmp wq 5506*c0909341SAndroid Build Coastguard Worker.w2: 5507*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [maskq+2*2] 5508*c0909341SAndroid Build Coastguard Worker.w2_s0_loop: 5509*c0909341SAndroid Build Coastguard Worker movd xm0, [dstq+dsq*0] 5510*c0909341SAndroid Build Coastguard Worker pinsrw xm0, [dstq+dsq*1], 1 5511*c0909341SAndroid Build Coastguard Worker movd xm1, [tmpq] 5512*c0909341SAndroid Build Coastguard Worker add tmpq, 2*2 5513*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm1 5514*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm2 5515*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm5 5516*c0909341SAndroid Build Coastguard Worker packuswb 
xm0, xm0 5517*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xm0, 0 5518*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xm0, 1 5519*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5520*c0909341SAndroid Build Coastguard Worker sub hd, 2 5521*c0909341SAndroid Build Coastguard Worker jg .w2_s0_loop 5522*c0909341SAndroid Build Coastguard Worker RET 5523*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5524*c0909341SAndroid Build Coastguard Worker.w4: 5525*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm2, [maskq+4*2] 5526*c0909341SAndroid Build Coastguard Worker.w4_loop: 5527*c0909341SAndroid Build Coastguard Worker movd xm0, [dstq+dsq*0] 5528*c0909341SAndroid Build Coastguard Worker pinsrd xm0, [dstq+dsq*1], 1 5529*c0909341SAndroid Build Coastguard Worker movq xm1, [tmpq] 5530*c0909341SAndroid Build Coastguard Worker add tmpq, 4*2 5531*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm1 5532*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm2 5533*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm5 5534*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 5535*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm0 5536*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm0, 1 5537*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5538*c0909341SAndroid Build Coastguard Worker sub hd, 2 5539*c0909341SAndroid Build Coastguard Worker jg .w4_loop 5540*c0909341SAndroid Build Coastguard Worker RET 5541*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5542*c0909341SAndroid Build Coastguard Worker.w8: 5543*c0909341SAndroid Build Coastguard Worker mova xm3, [maskq+8*2] 5544*c0909341SAndroid Build Coastguard Worker.w8_loop: 5545*c0909341SAndroid Build Coastguard Worker movq xm0, [dstq+dsq*0] 5546*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm1, [dstq+dsq*1] 5547*c0909341SAndroid Build Coastguard Worker mova xm2, [tmpq] 
5548*c0909341SAndroid Build Coastguard Worker add tmpq, 8*2 5549*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm2 5550*c0909341SAndroid Build Coastguard Worker punpckhbw xm1, xm2 5551*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm3 5552*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm3 5553*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm5 5554*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm5 5555*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 5556*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm0 5557*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm0 5558*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5559*c0909341SAndroid Build Coastguard Worker sub hd, 2 5560*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5561*c0909341SAndroid Build Coastguard Worker RET 5562*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5563*c0909341SAndroid Build Coastguard Worker.w16: 5564*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [maskq+16*2] 5565*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [maskq+16*3] 5566*c0909341SAndroid Build Coastguard Worker.w16_loop: 5567*c0909341SAndroid Build Coastguard Worker mova xm1, [dstq+dsq*0] 5568*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [dstq+dsq*1], 1 5569*c0909341SAndroid Build Coastguard Worker mova m2, [tmpq] 5570*c0909341SAndroid Build Coastguard Worker add tmpq, 16*2 5571*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 5572*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 5573*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 5574*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m4 5575*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5576*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5577*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5578*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 
; ---------------------------------------------------------------------------
; Tail of the preceding blend function (its .w16_loop continues from the line
; above this one). Reproduced unchanged.
; ---------------------------------------------------------------------------
    vextracti128    [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .w16_loop
    RET
ALIGN function_align
.w32:
    mova            xm3, [maskq+16*4]
    vinserti128     m3, [maskq+16*6], 1
    mova            xm4, [maskq+16*5]
    vinserti128     m4, [maskq+16*7], 1
.w32_loop:
    mova            m1, [dstq]
    mova            m2, [tmpq]
    add             tmpq, 32
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    pmaddubsw       m0, m3
    pmaddubsw       m1, m4
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
    packuswb        m0, m1
    mova            [dstq], m0
    add             dstq, dsq
    dec             hd
    jg              .w32_loop
    RET

; ---------------------------------------------------------------------------
; blend_h_8bpc(dst, ds, tmp, w, h)
;
; Blends tmp into dst in place using one weight pair per row. The pair is
; read from obmc_masks (the table that stores each mask value interleaved
; with its 64-x complement). The 6th name in the cglobal list, "mask", is not
; loaded from an argument: maskq is computed below from obmc_masks and h.
;
; Only h*3/4 rows are processed: hq is turned into a negative row counter
; running from -(h*3/4) up to 0 and maskq is biased so that [maskq+hq*2]
; addresses the pair belonging to the current row.
;
; Register roles:
;   r5  = blend_h_avx2_table (also the base for table-relative addressing)
;   r6d = w (saved before tzcnt overwrites wd), used by the .w32 path
;   m5  = pw_512, the pmulhrsw multiplier
;         (presumably words of 512, i.e. a rounded >>6 -- confirm in RODATA)
; tmp is consumed contiguously, w bytes per row.
; ---------------------------------------------------------------------------
cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea             r5, [blend_h_avx2_table]
    mov             r6d, wd                     ; keep w, wd becomes log2(w)
    tzcnt           wd, wd
    mov             hd, hm
    movsxd          wq, dword [r5+wq*4]         ; table entry is an offset relative to r5
    vpbroadcastd    m5, [base+pw_512]
    add             wq, r5                      ; wq = address of the .wN handler
    lea             maskq, [base+obmc_masks+hq*2]
    lea             hd, [hq*3]
    shr             hd, 2 ; h * 3/4
    lea             maskq, [maskq+hq*2]         ; bias by the row count so hq can run up to 0
    neg             hq
    jmp             wq
.w2:
    ; two rows of 2 pixels per iteration
    movd            xm0, [dstq+dsq*0]
    pinsrw          xm0, [dstq+dsq*1], 1
    movd            xm2, [maskq+hq*2]           ; weight pairs for both rows
    movd            xm1, [tmpq]
    add             tmpq, 2*2
    punpcklwd       xm2, xm2                    ; repeat each row's pair for its 2 pixels
    punpcklbw       xm0, xm1                    ; interleave dst,tmp bytes
    pmaddubsw       xm0, xm2                    ; dst*pair[0] + tmp*pair[1]
    pmulhrsw        xm0, xm5
    packuswb        xm0, xm0
    pextrw          [dstq+dsq*0], xm0, 0
    pextrw          [dstq+dsq*1], xm0, 1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w2
    RET
ALIGN function_align
.w4:
    mova            xm3, [blend_shuf]           ; spreads the two row pairs across 4 pixels each
.w4_loop:
    movd            xm0, [dstq+dsq*0]
    pinsrd          xm0, [dstq+dsq*1], 1
    movd            xm2, [maskq+hq*2]
    movq            xm1, [tmpq]
    add             tmpq, 4*2
    pshufb          xm2, xm3
    punpcklbw       xm0, xm1
    pmaddubsw       xm0, xm2
    pmulhrsw        xm0, xm5
    packuswb        xm0, xm0
    movd            [dstq+dsq*0], xm0
    pextrd          [dstq+dsq*1], xm0, 1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128  m4, [blend_shuf]
    shufpd          m4, m4, 0x03                ; low lane takes the upper qword, high lane the lower
.w8_loop:
    ; row 0 lives in the high 128-bit lane, row 1 in the low lane
    vpbroadcastq    m1, [dstq+dsq*0]
    movq            xm0, [dstq+dsq*1]
    vpblendd        m0, m1, 0x30
    vpbroadcastd    m3, [maskq+hq*2]
    movq            xm1, [tmpq+8*1]
    vinserti128     m1, [tmpq+8*0], 1
    add             tmpq, 8*2
    pshufb          m3, m4
    punpcklbw       m0, m1
    pmaddubsw       m0, m3
    pmulhrsw        m0, m5
    vextracti128    xm1, m0, 1
    packuswb        xm0, xm1                    ; low half = row 1, high half = row 0
    movhps          [dstq+dsq*0], xm0
    movq            [dstq+dsq*1], xm0
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128  m4, [blend_shuf]
    shufpd          m4, m4, 0x0c                ; per-lane qword selection for the two rows
.w16_loop:
    ; row 0 in the low lane, row 1 in the high lane
    mova            xm1, [dstq+dsq*0]
    vinserti128     m1, [dstq+dsq*1], 1
    vpbroadcastd    m3, [maskq+hq*2]
    mova            m2, [tmpq]
    add             tmpq, 16*2
    pshufb          m3, m4
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
    packuswb        m0, m1
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    add             hq, 2
    jl              .w16_loop
    RET
ALIGN function_align
.w32: ; w32/w64/w128
    sub             dsq, r6                     ; dsq = stride - w: the inner loop advances dstq by w
.w32_loop0:
    vpbroadcastw    m3, [maskq+hq*2]            ; one weight pair for the whole row
    mov             wd, r6d
.w32_loop:
    mova            m1, [dstq]
    mova            m2, [tmpq]
    add             tmpq, 32
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
    packuswb        m0, m1
    mova            [dstq], m0
    add             dstq, 32
    sub             wd, 32
    jg              .w32_loop
    add             dstq, dsq
    inc             hq
    jl              .w32_loop0
    RET

; ---------------------------------------------------------------------------
; emu_edge_8bpc(bw, bh, iw, ih, x, y, dst, dstride, src, sstride)
;
; Builds a bw x bh block in dst from an iw x ih source picture for a block
; whose top-left corner is (x, y), replicating edge pixels wherever the block
; reaches outside the picture.
;
; Steps:
;   1. src is moved to the clamped (x, y) position.
;   2. left/right/top/bottom extension sizes are computed, each clamped to
;      [0, bw-1] or [0, bh-1].
;   3. The center_h rows that overlap the picture are written, each row as
;      left replication + copied center + right replication (v_loop macro).
;   4. The last written row is replicated downwards, the first one upwards.
;
; Register roles: r12 = 0 (used as the lower clamp bound and, inside v_loop,
; as a scratch destination pointer); r3 and r2 temporarily hold bh-1 / bw-1.
; DEFINE_ARGS renames registers as their original contents become dead:
; y -> topext, x -> leftext, bh -> centerh, iw -> centerw.
; ---------------------------------------------------------------------------
cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                  bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor             r12d, r12d
    lea             r10, [ihq-1]
    cmp             yq, ihq
    cmovs           r10, yq                     ; y < ih  -> use y
    test            yq, yq
    cmovs           r10, r12                    ; y < 0   -> use 0
    imul            r10, sstrideq
    add             srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea             r10, [iwq-1]
    cmp             xq, iwq
    cmovs           r10, xq
    test            xq, xq
    cmovs           r10, r12
    add             srcq, r10

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea             bottomextq, [yq+bhq]
    sub             bottomextq, ihq
    lea             r3, [bhq-1]                 ; lea keeps the flags of the sub above
    cmovs           bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp             bottomextq, bhq
    cmovns          bottomextq, r3
    cmp             topextq, bhq
    ; Clamp when top_ext >= bh, like the other three extensions. The previous
    ; cmovg left top_ext == bh unclamped, which made center_h <= 0; v_loop
    ; tests its counter at the bottom, so it still wrote one row at
    ; dst + bh*dstride, i.e. past the bh-row block.
    cmovns          topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea             rightextq, [xq+bwq]
    sub             rightextq, iwq
    lea             r2, [bwq-1]                 ; lea keeps the flags of the sub above
    cmovs           rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg             leftextq
    cmovs           leftextq, r12
    cmp             rightextq, bwq
    cmovns          rightextq, r2
    cmp             leftextq, bwq
    cmovns          leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea             r3, [bottomextq+topextq]
    sub             centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov             r2, topextq
    imul            r2, dstrideq
    add             dstq, r2
    mov             r9m, dstq                   ; remember the first center row; reloaded at .top

    ; center_w = bw - left_ext - right_ext
    mov             centerwq, bwq
    lea             r3, [rightextq+leftextq]
    sub             centerwq, r3

; Writes center_h rows. Each loop stores whole 32-byte vectors and checks its
; counter afterwards, so it may write past the requested width (see the
; buffer-size assumption at the top of the function).
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor             r3, r3
    vpbroadcastb    m0, [srcq]
.left_loop_%3:
    mova            [dstq+r3], m0
    add             r3, 32
    cmp             r3, leftextq
    jl              .left_loop_%3

    ; body
    lea             r12, [dstq+leftextq]
%endif
    xor             r3, r3
.body_loop_%3:
    movu            m0, [srcq+r3]
%if %1
    movu            [r12+r3], m0
%else
    movu            [dstq+r3], m0
%endif
    add             r3, 32
    cmp             r3, centerwq
    jl              .body_loop_%3

%if %2
    ; right extension
%if %1
    add             r12, centerwq
%else
    lea             r12, [dstq+centerwq]
%endif
    xor             r3, r3
    vpbroadcastb    m0, [srcq+centerwq-1]
.right_loop_%3:
    movu            [r12+r3], m0
    add             r3, 32
    cmp             r3, rightextq
    jl              .right_loop_%3

%endif
    add             dstq, dstrideq
    add             srcq, sstrideq
    dec             centerhq
    jg              .v_loop_%3
%endmacro

    ; pick the v_loop variant matching the extensions that are needed
    test            leftextq, leftextq
    jnz             .need_left_ext
    test            rightextq, rightextq
    jnz             .need_right_ext
    v_loop          0, 0, 0
    jmp             .body_done

.need_left_ext:
    test            rightextq, rightextq
    jnz             .need_left_right_ext
    v_loop          1, 0, 1
    jmp             .body_done

.need_left_right_ext:
    v_loop          1, 1, 2
    jmp             .body_done

.need_right_ext:
    v_loop          0, 1, 3

.body_done:
    ; bottom edge extension
    test            bottomextq, bottomextq
    jz              .top
    mov             srcq, dstq
    sub             srcq, dstrideq              ; srcq = last row written by v_loop
    xor             r1, r1                      ; r1 (centerh, now dead) = column offset
.bottom_x_loop:
    mova            m0, [srcq+r1]
    lea             r3, [dstq+r1]
    mov             r4, bottomextq              ; r4 (leftext, now dead) = row counter
.bottom_y_loop:
    mova            [r3], m0
    add             r3, dstrideq
    dec             r4
    jg              .bottom_y_loop
    add             r1, 32
    cmp             r1, bwq
    jl              .bottom_x_loop

.top:
    ; top edge extension
    test            topextq, topextq
    jz              .end
    mov             srcq, r9m                   ; first center row saved above
    mov             dstq, dstm                  ; original dst argument
    xor             r1, r1
.top_x_loop:
    mova            m0, [srcq+r1]
    lea             r3, [dstq+r1]
    mov             r4, topextq
.top_y_loop:
    mova            [r3], m0
    add             r3, dstrideq
    dec             r4
    jg              .top_y_loop
    add             r1, 32
    cmp             r1, bwq
    jl              .top_x_loop

.end:
    RET

; ---------------------------------------------------------------------------
; resize_8bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0)
;
; Horizontal resampling of h rows. Eight output pixels are produced per inner
; iteration; for each one the position mx (14 fractional bits) advances by dx.
; Per pixel:
;   - the source offset is iclip(mx, 0, (src_w-8)<<14) >> 14,
;   - the filter row is (mx >> 8) & 63, 8 bytes per row in resize_filter,
;   - if clipping changed mx, the 8 loaded bytes are rearranged through
;     resize_shuf to emulate the picture edge.
; mx0 is biased by -(4<<14) and src_w by -8 in their stack slots on entry.
; Output is stored 8 bytes at a time, so dst must have room for dst_w rounded
; up to a multiple of 8.
; ---------------------------------------------------------------------------
cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub             dword mx0m, 4<<14
    sub             dword src_wm, 8
    vpbroadcastd    m5, dxm
    vpbroadcastd    m8, mx0m
    vpbroadcastd    m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA             r7, $$
%define base r7-$$

    vpbroadcastd    xm3, [base+pw_m256]
    vpbroadcastd    m7, [base+pd_63]
    vbroadcasti128  m15, [base+pb_8x0_8x8]
    pmaddwd         m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld           m5, 3 ; dx*8
    pslld           m6, 14
    paddd           m8, m2 ; mx+[0..7]*dx
    pxor            m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = (src_w-8)<<14, m7 = 0x3f, m15=0,8

.loop_y:
    xor             xd, xd
    mova            m4, m8 ; per-line working version of mx

.loop_x:
    pmaxsd          m0, m4, m2
    psrad           m9, m4, 8 ; filter offset (unmasked)
    pminsd          m0, m6 ; iclip(mx, 0, src_w-8)
    psubd           m1, m4, m0 ; pshufb offset
    psrad           m0, 14 ; clipped src_x offset
    psrad           m1, 14 ; pshufb edge_emu offset
    pand            m9, m7 ; filter offset (masked)

    ; load source pixels - this ugly code is vpgatherdq emulation since
    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    ; m12 receives the 8-byte groups of pixels 0,1,4,5 and m13 those of
    ; pixels 2,3,6,7 (one qword each).
    movd            r8d, xm0
    pextrd          r9d, xm0, 1
    pextrd          r10d, xm0, 2
    pextrd          r11d, xm0, 3
    vextracti128    xm0, m0, 1
    movq            xm12, [srcq+r8]
    movq            xm13, [srcq+r10]
    movhps          xm12, [srcq+r9]
    movhps          xm13, [srcq+r11]
    movd            r8d, xm0
    pextrd          r9d, xm0, 1
    pextrd          r10d, xm0, 2
    pextrd          r11d, xm0, 3
    vinserti128     m12, [srcq+r8], 1
    vinserti128     m13, [srcq+r10], 1
    vpbroadcastq    m10, [srcq+r9]
    vpbroadcastq    m11, [srcq+r11]
    vpblendd        m12, m10, 11000000b
    vpblendd        m13, m11, 11000000b

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    vptest          m1, m1
    jz              .filter

    ; gather one 8-byte shuffle pattern per pixel, indexed by its signed
    ; edge offset (two 32-bit offsets are unpacked from each 64-bit move)
    movq            r9, xm1
    pextrq          r11, xm1, 1
    movsxd          r8, r9d
    sar             r9, 32
    movsxd          r10, r11d
    sar             r11, 32
    vextracti128    xm1, m1, 1
    movq            xm14, [base+resize_shuf+4+r8]
    movq            xm0, [base+resize_shuf+4+r10]
    movhps          xm14, [base+resize_shuf+4+r9]
    movhps          xm0, [base+resize_shuf+4+r11]
    movq            r9, xm1
    pextrq          r11, xm1, 1
    movsxd          r8, r9d
    sar             r9, 32
    movsxd          r10, r11d
    sar             r11, 32
    vinserti128     m14, [base+resize_shuf+4+r8], 1
    vinserti128     m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq    m10, [base+resize_shuf+4+r9]
    vpbroadcastq    m11, [base+resize_shuf+4+r11]
    vpblendd        m14, m10, 11000000b
    vpblendd        m0, m11, 11000000b

    paddb           m14, m15                    ; +8 on the upper qword of each lane so its
    paddb           m0, m15                     ; pattern indexes that qword's own bytes
    pshufb          m12, m14
    pshufb          m13, m0

.filter:
    ; gather the 8-tap filter row of each pixel, same layout as m12/m13
    movd            r8d, xm9
    pextrd          r9d, xm9, 1
    pextrd          r10d, xm9, 2
    pextrd          r11d, xm9, 3
    vextracti128    xm9, m9, 1
    movq            xm10, [base+resize_filter+r8*8]
    movq            xm11, [base+resize_filter+r10*8]
    movhps          xm10, [base+resize_filter+r9*8]
    movhps          xm11, [base+resize_filter+r11*8]
    movd            r8d, xm9
    pextrd          r9d, xm9, 1
    pextrd          r10d, xm9, 2
    pextrd          r11d, xm9, 3
    vinserti128     m10, [base+resize_filter+r8*8], 1
    vinserti128     m11, [base+resize_filter+r10*8], 1
    vpbroadcastq    m14, [base+resize_filter+r9*8]
    vpbroadcastq    m1, [base+resize_filter+r11*8]
    vpblendd        m10, m14, 11000000b
    vpblendd        m11, m1, 11000000b

    pmaddubsw       m12, m10
    pmaddubsw       m13, m11
    phaddw          m12, m13
    vextracti128    xm13, m12, 1
    phaddsw         xm12, xm13                  ; 8 filtered sums, one word per output pixel
    pmulhrsw        xm12, xm3 ; x=(x+64)>>7
    packuswb        xm12, xm12
    movq            [dstq+xq], xm12

    paddd           m4, m5
    add             xd, 8
    cmp             xd, dst_wd
    jl              .loop_x

    add             dstq, dst_strideq
    add             srcq, src_strideq
    dec             hd
    jg              .loop_y
    RET

; ---------------------------------------------------------------------------
; Head of the next function (it continues on the following lines).
; Reproduced unchanged.
; ---------------------------------------------------------------------------
cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea             r7, [w_mask_420_avx2_table]
    tzcnt           wd, wm
    mov             r6d, r7m ; sign
    movifnidn       hd, hm
    movsxd          wq, [r7+wq*4]
    vpbroadcastd    m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd    m7, [base+pw_2048]
    pmovzxbd        m9, [base+deint_shuf4]
    vpbroadcastd    m8, [base+wm_420_sign+r6*4] ; 258 - sign
    add             wq, r7
    W_MASK          0, 4, 0, 1
Coastguard Worker mov maskq, maskmp 6068*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 6069*c0909341SAndroid Build Coastguard Worker jmp wq 6070*c0909341SAndroid Build Coastguard Worker.w4: 6071*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6072*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 6073*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6074*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6075*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 6076*c0909341SAndroid Build Coastguard Worker cmp hd, 8 6077*c0909341SAndroid Build Coastguard Worker jl .w4_end 6078*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6079*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 6080*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6081*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6082*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6083*c0909341SAndroid Build Coastguard Worker jg .w4_h16 6084*c0909341SAndroid Build Coastguard Worker.w4_end: 6085*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m4, 1 6086*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm4, xm0, 0x05 6087*c0909341SAndroid Build Coastguard Worker vpblendd xm4, xm0, 0x0a 6088*c0909341SAndroid Build Coastguard Worker pshufd xm1, xm1, q2301 6089*c0909341SAndroid Build Coastguard Worker psubw xm4, xm8, xm4 6090*c0909341SAndroid Build Coastguard Worker psubw xm4, xm1 6091*c0909341SAndroid Build Coastguard Worker psrlw xm4, 2 6092*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm4 6093*c0909341SAndroid Build Coastguard Worker movq [maskq], xm4 6094*c0909341SAndroid Build Coastguard Worker RET 6095*c0909341SAndroid Build Coastguard Worker.w4_h16: 6096*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6097*c0909341SAndroid Build Coastguard 
Worker lea dstq, [dstq+strideq*4] 6098*c0909341SAndroid Build Coastguard Worker phaddd m4, m5 6099*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6100*c0909341SAndroid Build Coastguard Worker psubw m4, m8, m4 6101*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 6102*c0909341SAndroid Build Coastguard Worker vpermd m4, m9, m4 6103*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 6104*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm5 6105*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 6106*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6107*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6108*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q], xm1, 1 6109*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6110*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 6111*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6112*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6113*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6114*c0909341SAndroid Build Coastguard Worker mova [maskq], xm4 6115*c0909341SAndroid Build Coastguard Worker RET 6116*c0909341SAndroid Build Coastguard Worker.w8_loop: 6117*c0909341SAndroid Build Coastguard Worker add tmp1q, 2*32 6118*c0909341SAndroid Build Coastguard Worker add tmp2q, 2*32 6119*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6120*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6121*c0909341SAndroid Build Coastguard Worker add maskq, 8 6122*c0909341SAndroid Build Coastguard Worker.w8: 6123*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m4, 1 6124*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6125*c0909341SAndroid Build Coastguard Worker psubw xm4, xm8, xm4 6126*c0909341SAndroid Build Coastguard Worker psubw xm4, xm2 6127*c0909341SAndroid 
Build Coastguard Worker psrlw xm4, 2 6128*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm4 6129*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 6130*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 6131*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 6132*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 6133*c0909341SAndroid Build Coastguard Worker movq [maskq], xm4 6134*c0909341SAndroid Build Coastguard Worker sub hd, 4 6135*c0909341SAndroid Build Coastguard Worker jg .w8_loop 6136*c0909341SAndroid Build Coastguard Worker RET 6137*c0909341SAndroid Build Coastguard Worker.w16_loop: 6138*c0909341SAndroid Build Coastguard Worker add tmp1q, 4*32 6139*c0909341SAndroid Build Coastguard Worker add tmp2q, 4*32 6140*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6141*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6142*c0909341SAndroid Build Coastguard Worker add maskq, 16 6143*c0909341SAndroid Build Coastguard Worker.w16: 6144*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6145*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 6146*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 6147*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6148*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m4, m5 6149*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m5 6150*c0909341SAndroid Build Coastguard Worker psubw m1, m8, m1 6151*c0909341SAndroid Build Coastguard Worker psubw m1, m4 6152*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 6153*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6154*c0909341SAndroid Build Coastguard Worker packuswb m1, m1 6155*c0909341SAndroid Build Coastguard Worker vpermd m1, m9, m1 6156*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm0 6157*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m0, 
1 6158*c0909341SAndroid Build Coastguard Worker mova [maskq], xm1 6159*c0909341SAndroid Build Coastguard Worker sub hd, 4 6160*c0909341SAndroid Build Coastguard Worker jg .w16_loop 6161*c0909341SAndroid Build Coastguard Worker RET 6162*c0909341SAndroid Build Coastguard Worker.w32_loop: 6163*c0909341SAndroid Build Coastguard Worker add tmp1q, 4*32 6164*c0909341SAndroid Build Coastguard Worker add tmp2q, 4*32 6165*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6166*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 6167*c0909341SAndroid Build Coastguard Worker add maskq, 16 6168*c0909341SAndroid Build Coastguard Worker.w32: 6169*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6170*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 6171*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6172*c0909341SAndroid Build Coastguard Worker psubw m4, m8, m4 6173*c0909341SAndroid Build Coastguard Worker psubw m4, m5 6174*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 6175*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6176*c0909341SAndroid Build Coastguard Worker packuswb m4, m4 6177*c0909341SAndroid Build Coastguard Worker vpermd m4, m9, m4 6178*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 6179*c0909341SAndroid Build Coastguard Worker mova [maskq], xm4 6180*c0909341SAndroid Build Coastguard Worker sub hd, 2 6181*c0909341SAndroid Build Coastguard Worker jg .w32_loop 6182*c0909341SAndroid Build Coastguard Worker RET 6183*c0909341SAndroid Build Coastguard Worker.w64_loop_even: 6184*c0909341SAndroid Build Coastguard Worker psubw m10, m8, m4 6185*c0909341SAndroid Build Coastguard Worker psubw m11, m8, m5 6186*c0909341SAndroid Build Coastguard Worker dec hd 6187*c0909341SAndroid Build Coastguard Worker.w64_loop: 6188*c0909341SAndroid Build Coastguard Worker add tmp1q, 4*32 6189*c0909341SAndroid Build Coastguard Worker add tmp2q, 4*32 6190*c0909341SAndroid Build Coastguard 
Worker W_MASK 0, 4, 0, 1 6191*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6192*c0909341SAndroid Build Coastguard Worker.w64: 6193*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6194*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6195*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6196*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6197*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6198*c0909341SAndroid Build Coastguard Worker test hd, 1 6199*c0909341SAndroid Build Coastguard Worker jz .w64_loop_even 6200*c0909341SAndroid Build Coastguard Worker psubw m4, m10, m4 6201*c0909341SAndroid Build Coastguard Worker psubw m5, m11, m5 6202*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 6203*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 6204*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6205*c0909341SAndroid Build Coastguard Worker vpermd m4, m9, m4 6206*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 6207*c0909341SAndroid Build Coastguard Worker add maskq, 32 6208*c0909341SAndroid Build Coastguard Worker dec hd 6209*c0909341SAndroid Build Coastguard Worker jg .w64_loop 6210*c0909341SAndroid Build Coastguard Worker RET 6211*c0909341SAndroid Build Coastguard Worker.w128_loop_even: 6212*c0909341SAndroid Build Coastguard Worker psubw m12, m8, m4 6213*c0909341SAndroid Build Coastguard Worker psubw m13, m8, m5 6214*c0909341SAndroid Build Coastguard Worker dec hd 6215*c0909341SAndroid Build Coastguard Worker.w128_loop: 6216*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6217*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6218*c0909341SAndroid Build Coastguard Worker.w128: 6219*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6220*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6221*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6222*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 
6223*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6224*c0909341SAndroid Build Coastguard Worker add tmp1q, 8*32 6225*c0909341SAndroid Build Coastguard Worker add tmp2q, 8*32 6226*c0909341SAndroid Build Coastguard Worker test hd, 1 6227*c0909341SAndroid Build Coastguard Worker jz .w128_even 6228*c0909341SAndroid Build Coastguard Worker psubw m4, m10, m4 6229*c0909341SAndroid Build Coastguard Worker psubw m5, m11, m5 6230*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 6231*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 6232*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6233*c0909341SAndroid Build Coastguard Worker vpermd m4, m9, m4 6234*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 6235*c0909341SAndroid Build Coastguard Worker jmp .w128_odd 6236*c0909341SAndroid Build Coastguard Worker.w128_even: 6237*c0909341SAndroid Build Coastguard Worker psubw m10, m8, m4 6238*c0909341SAndroid Build Coastguard Worker psubw m11, m8, m5 6239*c0909341SAndroid Build Coastguard Worker.w128_odd: 6240*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, -4, -3 6241*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6242*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 6243*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, -2, -1 6244*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6245*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m0 6246*c0909341SAndroid Build Coastguard Worker test hd, 1 6247*c0909341SAndroid Build Coastguard Worker jz .w128_loop_even 6248*c0909341SAndroid Build Coastguard Worker psubw m4, m12, m4 6249*c0909341SAndroid Build Coastguard Worker psubw m5, m13, m5 6250*c0909341SAndroid Build Coastguard Worker psrlw m4, 2 6251*c0909341SAndroid Build Coastguard Worker psrlw m5, 2 6252*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6253*c0909341SAndroid Build Coastguard Worker vpermd m4, m9, m4 6254*c0909341SAndroid Build Coastguard Worker mova 
[maskq+32*1], m4 6255*c0909341SAndroid Build Coastguard Worker add maskq, 64 6256*c0909341SAndroid Build Coastguard Worker dec hd 6257*c0909341SAndroid Build Coastguard Worker jg .w128_loop 6258*c0909341SAndroid Build Coastguard Worker RET 6259*c0909341SAndroid Build Coastguard Worker 6260*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 6261*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx2_table 6262*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_422_avx2_table] 6263*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 6264*c0909341SAndroid Build Coastguard Worker mov r6d, r7m ; sign 6265*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 6266*c0909341SAndroid Build Coastguard Worker pxor m9, m9 6267*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4] 6268*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 6269*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_2048] 6270*c0909341SAndroid Build Coastguard Worker pmovzxbd m10, [base+deint_shuf4] 6271*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign 6272*c0909341SAndroid Build Coastguard Worker add wq, r7 6273*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 6274*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6275*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 6276*c0909341SAndroid Build Coastguard Worker jmp wq 6277*c0909341SAndroid Build Coastguard Worker.w4: 6278*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6279*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 6280*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6281*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6282*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 
6283*c0909341SAndroid Build Coastguard Worker cmp hd, 8 6284*c0909341SAndroid Build Coastguard Worker jl .w4_end 6285*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6286*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 6287*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6288*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6289*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6290*c0909341SAndroid Build Coastguard Worker jg .w4_h16 6291*c0909341SAndroid Build Coastguard Worker.w4_end: 6292*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 6293*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm5 6294*c0909341SAndroid Build Coastguard Worker psubb xm5, xm8, xm4 6295*c0909341SAndroid Build Coastguard Worker pavgb xm5, xm9 6296*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm5, q3120 6297*c0909341SAndroid Build Coastguard Worker mova [maskq], xm5 6298*c0909341SAndroid Build Coastguard Worker RET 6299*c0909341SAndroid Build Coastguard Worker.w4_h16: 6300*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6301*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6302*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6303*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6304*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6305*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6306*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6307*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 6308*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6309*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6310*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 6311*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6312*c0909341SAndroid Build Coastguard Worker pextrd 
[dstq+strideq*0], xm0, 2 6313*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6314*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6315*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6316*c0909341SAndroid Build Coastguard Worker mova [maskq], m5 6317*c0909341SAndroid Build Coastguard Worker RET 6318*c0909341SAndroid Build Coastguard Worker.w8_loop: 6319*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*2 6320*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*2 6321*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6322*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6323*c0909341SAndroid Build Coastguard Worker add maskq, 16 6324*c0909341SAndroid Build Coastguard Worker.w8: 6325*c0909341SAndroid Build Coastguard Worker vextracti128 xm5, m4, 1 6326*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6327*c0909341SAndroid Build Coastguard Worker packuswb xm4, xm5 6328*c0909341SAndroid Build Coastguard Worker psubb xm5, xm8, xm4 6329*c0909341SAndroid Build Coastguard Worker pavgb xm5, xm9 6330*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm5, q3120 6331*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 6332*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 6333*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 6334*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 6335*c0909341SAndroid Build Coastguard Worker mova [maskq], xm5 6336*c0909341SAndroid Build Coastguard Worker sub hd, 4 6337*c0909341SAndroid Build Coastguard Worker jg .w8_loop 6338*c0909341SAndroid Build Coastguard Worker RET 6339*c0909341SAndroid Build Coastguard Worker.w16_loop: 6340*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 6341*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 6342*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6343*c0909341SAndroid Build 
Coastguard Worker lea dstq, [dstq+strideq*4] 6344*c0909341SAndroid Build Coastguard Worker add maskq, 32 6345*c0909341SAndroid Build Coastguard Worker.w16: 6346*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6347*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 6348*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 6349*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6350*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6351*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6352*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6353*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6354*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6355*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm0 6356*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+stride3q ], m0, 1 6357*c0909341SAndroid Build Coastguard Worker mova [maskq], m5 6358*c0909341SAndroid Build Coastguard Worker sub hd, 4 6359*c0909341SAndroid Build Coastguard Worker jg .w16_loop 6360*c0909341SAndroid Build Coastguard Worker RET 6361*c0909341SAndroid Build Coastguard Worker.w32_loop: 6362*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 6363*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 6364*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6365*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 6366*c0909341SAndroid Build Coastguard Worker add maskq, 32 6367*c0909341SAndroid Build Coastguard Worker.w32: 6368*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6369*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 6370*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6371*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6372*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6373*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6374*c0909341SAndroid Build Coastguard 
Worker vpermq m0, m0, q3120 6375*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6376*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 6377*c0909341SAndroid Build Coastguard Worker mova [maskq], m5 6378*c0909341SAndroid Build Coastguard Worker sub hd, 2 6379*c0909341SAndroid Build Coastguard Worker jg .w32_loop 6380*c0909341SAndroid Build Coastguard Worker RET 6381*c0909341SAndroid Build Coastguard Worker.w64_loop: 6382*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 6383*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 6384*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6385*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6386*c0909341SAndroid Build Coastguard Worker add maskq, 32 6387*c0909341SAndroid Build Coastguard Worker.w64: 6388*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6389*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6390*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6391*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6392*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6393*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6394*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6395*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6396*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6397*c0909341SAndroid Build Coastguard Worker mova [maskq], m5 6398*c0909341SAndroid Build Coastguard Worker dec hd 6399*c0909341SAndroid Build Coastguard Worker jg .w64_loop 6400*c0909341SAndroid Build Coastguard Worker RET 6401*c0909341SAndroid Build Coastguard Worker.w128_loop: 6402*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*8 6403*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*8 6404*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 6405*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6406*c0909341SAndroid Build Coastguard Worker add maskq, 32*2 
6407*c0909341SAndroid Build Coastguard Worker.w128: 6408*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6409*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6410*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 2, 3 6411*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6412*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6413*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6414*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6415*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6416*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6417*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m5 6418*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 4, 5 6419*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6420*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 6421*c0909341SAndroid Build Coastguard Worker W_MASK 0, 5, 6, 7 6422*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 6423*c0909341SAndroid Build Coastguard Worker psubb m5, m8, m4 6424*c0909341SAndroid Build Coastguard Worker pavgb m5, m9 6425*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6426*c0909341SAndroid Build Coastguard Worker vpermd m5, m10, m5 6427*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m0 6428*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m5 6429*c0909341SAndroid Build Coastguard Worker dec hd 6430*c0909341SAndroid Build Coastguard Worker jg .w128_loop 6431*c0909341SAndroid Build Coastguard Worker RET 6432*c0909341SAndroid Build Coastguard Worker 6433*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 6434*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx2_table 6435*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_444_avx2_table] 6436*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 6437*c0909341SAndroid Build 
Coastguard Worker movifnidn hd, hm 6438*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 6439*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4] 6440*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 6441*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pb_64] 6442*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_2048] 6443*c0909341SAndroid Build Coastguard Worker add wq, r7 6444*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6445*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 6446*c0909341SAndroid Build Coastguard Worker jmp wq 6447*c0909341SAndroid Build Coastguard Worker.w4: 6448*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6449*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 6450*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6451*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6452*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 6453*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 6454*c0909341SAndroid Build Coastguard Worker cmp hd, 8 6455*c0909341SAndroid Build Coastguard Worker jl .w4_end 6456*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6457*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 6458*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6459*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6460*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6461*c0909341SAndroid Build Coastguard Worker je .w4_end 6462*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 2, 3, 1 6463*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6464*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6465*c0909341SAndroid Build Coastguard Worker movd 
[dstq+strideq*0], xm0 6466*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 6467*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 6468*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 6469*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6470*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 6471*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 6472*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 6473*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 6474*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m4 6475*c0909341SAndroid Build Coastguard Worker.w4_end: 6476*c0909341SAndroid Build Coastguard Worker RET 6477*c0909341SAndroid Build Coastguard Worker.w8_loop: 6478*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*2 6479*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*2 6480*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6481*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 6482*c0909341SAndroid Build Coastguard Worker add maskq, 32 6483*c0909341SAndroid Build Coastguard Worker.w8: 6484*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 6485*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 6486*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 6487*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 6488*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 6489*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 6490*c0909341SAndroid Build Coastguard Worker sub hd, 4 6491*c0909341SAndroid Build Coastguard Worker jg .w8_loop 6492*c0909341SAndroid Build Coastguard Worker RET 6493*c0909341SAndroid Build Coastguard Worker.w16_loop: 6494*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*2 6495*c0909341SAndroid Build Coastguard Worker add 
tmp2q, 32*2 6496*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6497*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 6498*c0909341SAndroid Build Coastguard Worker add maskq, 32 6499*c0909341SAndroid Build Coastguard Worker.w16: 6500*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6501*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 6502*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], m0, 1 6503*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 6504*c0909341SAndroid Build Coastguard Worker sub hd, 2 6505*c0909341SAndroid Build Coastguard Worker jg .w16_loop 6506*c0909341SAndroid Build Coastguard Worker RET 6507*c0909341SAndroid Build Coastguard Worker.w32_loop: 6508*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*2 6509*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*2 6510*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6511*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6512*c0909341SAndroid Build Coastguard Worker add maskq, 32 6513*c0909341SAndroid Build Coastguard Worker.w32: 6514*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6515*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 6516*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 6517*c0909341SAndroid Build Coastguard Worker dec hd 6518*c0909341SAndroid Build Coastguard Worker jg .w32_loop 6519*c0909341SAndroid Build Coastguard Worker RET 6520*c0909341SAndroid Build Coastguard Worker.w64_loop: 6521*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*4 6522*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*4 6523*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6524*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6525*c0909341SAndroid Build Coastguard Worker add maskq, 32*2 6526*c0909341SAndroid Build Coastguard Worker.w64: 6527*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 
6528*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6529*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 6530*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 2, 3, 1 6531*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6532*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6533*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m4 6534*c0909341SAndroid Build Coastguard Worker dec hd 6535*c0909341SAndroid Build Coastguard Worker jg .w64_loop 6536*c0909341SAndroid Build Coastguard Worker RET 6537*c0909341SAndroid Build Coastguard Worker.w128_loop: 6538*c0909341SAndroid Build Coastguard Worker add tmp1q, 32*8 6539*c0909341SAndroid Build Coastguard Worker add tmp2q, 32*8 6540*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 6541*c0909341SAndroid Build Coastguard Worker add dstq, strideq 6542*c0909341SAndroid Build Coastguard Worker add maskq, 32*4 6543*c0909341SAndroid Build Coastguard Worker.w128: 6544*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6545*c0909341SAndroid Build Coastguard Worker mova [dstq+32*0], m0 6546*c0909341SAndroid Build Coastguard Worker mova [maskq+32*0], m4 6547*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 2, 3, 1 6548*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6549*c0909341SAndroid Build Coastguard Worker mova [dstq+32*1], m0 6550*c0909341SAndroid Build Coastguard Worker mova [maskq+32*1], m4 6551*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 4, 5, 1 6552*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6553*c0909341SAndroid Build Coastguard Worker mova [dstq+32*2], m0 6554*c0909341SAndroid Build Coastguard Worker mova [maskq+32*2], m4 6555*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 6, 7, 1 6556*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 6557*c0909341SAndroid Build Coastguard Worker mova [dstq+32*3], m0 6558*c0909341SAndroid Build Coastguard Worker mova [maskq+32*3], 
m4 6559*c0909341SAndroid Build Coastguard Worker dec hd 6560*c0909341SAndroid Build Coastguard Worker jg .w128_loop 6561*c0909341SAndroid Build Coastguard Worker RET 6562*c0909341SAndroid Build Coastguard Worker 6563*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64 6564