; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; ---------------------------------------------------------------------------
; Read-only constants.
;
; Most of the tables below are consumed by code that lies outside the part of
; the file shown here, so only their layout is described. Tables prefixed
; z_ / z2_ presumably belong to the ipred_z1/z2/z3 predictors whose jump
; tables are declared further down - confirm against those functions.
; ---------------------------------------------------------------------------

; 16-byte tables (byte indices; -1 has the top bit set).
filter_shuf:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
; Even byte indices followed by odd byte indices.
pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
; Eight words: lane index * 64, ascending and descending.
z_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
z_base_inc_z2: dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64
z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; The first eight bytes, read as little-endian words, are -1, -2, -3, -4; the
; pw_m1to4 alias defined after the constants points at them.
z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1,  8,  9,  8,  9, 10, 11, 12, 13
               db  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
z2_top_shufA:  db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_top_shufB:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_left_shufA: db 14, 15, 12, 13, 10, 11,  8,  9, 12, 13, 10, 11,  8,  9,  6,  7
z2_left_shufB: db 14, 15, 10, 11,  6,  7,  2,  3, 12, 13,  8,  9,  4,  5,  0,  1
z_filt_wh16:   db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filt_t_w48:  db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
               db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
z_filt_t_w16:  db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
; NOTE(review): the trailing comma after the fourth byte is in the source
; as-is; z_filt_wh8 follows immediately, so the two labels are contiguous only
; if the assembler emits nothing for the empty operand - confirm.
z_filt_wh4:    db  7,  7, 19,  7,
z_filt_wh8:    db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
ALIGN 8
; From here on every label is an 8-byte row.
; pb_2_3: byte pair (2, 3) four times. ipred_h_16bpc loads it with movddup and
; uses it as a pshufb mask, which replicates source word 1 into every word.
pb_2_3:        times 4 db 2, 3
z2_dy_offset:  dw 96*64, 96*64, 95*64, 95*64
; Four rows of four identical words: 8, 6, 4, 5. The pw_8 and pw_4 aliases
; defined after the constants point at row 0 and row 2 (byte offsets 0 and 16).
z_filt_k:      times 4 dw 8
               times 4 dw 6
               times 4 dw 4
               times 4 dw 5
; Negative multiples of 512, from -3584 up to -512.
pw_m3584:      times 4 dw -3584
pw_m3072:      times 4 dw -3072
pw_m2560:      times 4 dw -2560
pw_m2048:      times 4 dw -2048
pw_m1536:      times 4 dw -1536
pw_m1024:      times 4 dw -1024
pw_m512:       times 4 dw -512
pw_1:          times 4 dw 1
; Word constants, continued: each label is one 8-byte row of four equal words.
pw_2:          times 4 dw 2
pw_3:          times 4 dw 3
pw_62:         times 4 dw 62
; pw_256: every word is 0x0100, i.e. bytes (0, 1). ipred_h_16bpc uses it as a
; pshufb mask, which replicates source word 0 into every word.
pw_256:        times 4 dw 256
; pw_512 and pw_2048 must stay adjacent and in this order: ipred_dc_128_16bpc
; addresses them as pw_512 + index*8 with index = (argument 8) >> 11.
pw_512:        times 4 dw 512
pw_2048:       times 4 dw 2048

; Aliases into existing tables instead of separate constants.
%define pw_4 (z_filt_k+8*2)     ; third row of z_filt_k (four words of 4)
%define pw_8 (z_filt_k+8*0)     ; first row of z_filt_k (four words of 8)
%define pw_m1to4 z2_upsample_l  ; first 8 bytes are the words -1, -2, -3, -4

; JMP_TABLE name, isa, label [, label ...]
;
; Emits one 32-bit entry per label. Each entry is the address of the local
; label .<label> inside function <private_prefix>_<name>_<isa>, minus the
; biased table address (%%table - 2*4).
;
; The symbol <name>_<isa>_table is defined as that biased address, two entries
; before the first one, so the callers below can index it directly with
; tzcnt(size): the smallest size, 4, gives index 2, which is the first entry.
; Callers add the table address back to the loaded entry to get the target.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; Sub-tables living inside larger jump tables. The matching entries in the
; JMP_TABLE invocations carry a "-N*4" correction so that they end up relative
; to the sub-table symbol rather than to the parent table symbol.
%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
%define ipred_dc_128_16bpc_ssse3_table   (ipred_dc_16bpc_ssse3_table + 15*4)
%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)

JMP_TABLE ipred_dc_left_16bpc,    ssse3, h4, h8, h16, h32, h64
; ipred_dc table layout (entry index before the 2-entry bias):
;    0.. 4  .h4 .. .h64    left-edge summation, indexed by tzcnt(h)
;    5.. 9  .w4 .. .w64    top-edge summation + division, indexed by tzcnt(w)
;   10..14  .s4 .. .s64    "splat" sub-table: plain store loops
;   15..19  .s4, .s8, .s16c, .s32c, .s64   "128" sub-table: the c variants
;           first copy m0 into the other data registers
JMP_TABLE ipred_dc_16bpc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                         s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
                                         s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
JMP_TABLE ipred_h_16bpc,          ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_cfl_16bpc,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                         s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   ssse3, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         ssse3, w4, w8, w16, w32, w64

; Tables defined in other translation units.
cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

INIT_XMM ssse3
; ---------------------------------------------------------------------------
; ipred_dc_top_16bpc(dst, stride, tl, w, h)
;
; Fills the block with the rounded mean of the w 16-bit samples that start at
; tl+2. The function body is only a prologue: the summation is done by the
; .h<w> labels of ipred_dc_left_16bpc (selected with tzcnt(w)), and the stores
; by the .s<w> labels of ipred_dc_16bpc, reached through the "128" sub-table.
;
; Register contract handed to the shared code:
;   m0 = first 16 bytes of the edge, m3 = 0, m4 = rounding term,
;   m5 = shift count, r6 = summation entry, wq = store entry.
; w is assumed to be a power of two (tzcnt is used as log2).
; The numeric cglobal operands follow the x86inc convention (argument count to
; load, GPR count, XMM count).
; ---------------------------------------------------------------------------
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    movd                 m4, wm                 ; m4 = w
    tzcnt                wd, wm                 ; wd = log2(w)
    add                 tlq, 2                  ; step over the sample at tl
    movifnidn            hd, hm
    pxor                 m3, m3                 ; m3 = 0
    pavgw                m4, m3                 ; m4 = (w + 1) >> 1, the rounding term
    movd                 m5, wd                 ; shift count = log2(w)
    movu                 m0, [tlq]
    movsxd               r6, [r5+wq*4]          ; offset of ipred_dc_left .h<w>
    add                  r6, r5
    ; rebase r5 from the dc_left table to the "128" store sub-table
    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
    movsxd               wq, [r5+wq*4]          ; offset of the store routine for w
    add                  wq, r5
    jmp                  r6

; ---------------------------------------------------------------------------
; ipred_dc_left_16bpc(dst, stride, tl, w, h)
;
; Fills the block with the rounded mean of the h 16-bit samples that end just
; before tl (tl - 2*h .. tl - 2). The .h64 .. .h4 labels form a fall-through
; chain that is also entered from ipred_dc_top_16bpc. On entry to any of them:
;   tlq = start of the samples, m0 = first 16 bytes, m3 = 0,
;   m4 = rounding term, m5 = shift count, wq = store routine to jump to.
; stride3 is the sixth name in the cglobal list, so by the x86inc naming
; convention it aliases r5; r5 is free again once the table lookups are done.
; ---------------------------------------------------------------------------
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    mov                  hd, hm
    movd                 m4, hm                 ; m4 = h
    tzcnt               r6d, hd                 ; r6d = log2(h)
    sub                 tlq, hq
    tzcnt                wd, wm                 ; wd = log2(w), selects the store routine
    pxor                 m3, m3                 ; m3 = 0
    sub                 tlq, hq                 ; tlq -= 2*h in total
    pavgw                m4, m3                 ; m4 = (h + 1) >> 1, the rounding term
    movd                 m5, r6d                ; shift count = log2(h)
    movu                 m0, [tlq]
    movsxd               r6, [r5+r6*4]          ; offset of .h<h>
    add                  r6, r5
    ; rebase r5 from the dc_left table to the "128" store sub-table
    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:                                           ; add bytes 64..127 of the edge
    movu                 m2, [tlq+112]
    movu                 m1, [tlq+ 96]
    paddw                m0, m2
    movu                 m2, [tlq+ 80]
    paddw                m1, m2
    movu                 m2, [tlq+ 64]
    paddw                m0, m2
    paddw                m0, m1
.h32:                                           ; add bytes 32..63
    movu                 m1, [tlq+ 48]
    movu                 m2, [tlq+ 32]
    paddw                m1, m2
    paddw                m0, m1
.h16:                                           ; add bytes 16..31
    movu                 m1, [tlq+ 16]
    paddw                m0, m1
.h8:                                            ; fold the upper 4 words onto the lower 4
    movhlps              m1, m0
    paddw                m0, m1
.h4:
    punpcklwd            m0, m3                 ; widen the low 4 words to dwords
    paddd                m4, m0                 ; rounding term joins lane 0
    punpckhqdq           m0, m0                 ; bring dwords 2,3 down to 0,1
    paddd                m0, m4
    pshuflw              m4, m0, q1032          ; swap dwords 0 and 1
    paddd                m0, m4                 ; lane 0 = sum + rounding
    psrld                m0, m5                 ; divide by the sample count
    lea            stride3q, [strideq*3]
    pshuflw              m0, m0, q0000          ; broadcast word 0 ...
    punpcklqdq           m0, m0                 ; ... to all 8 words
    jmp                  wq                     ; store routine in ipred_dc_16bpc

; ---------------------------------------------------------------------------
; ipred_dc_16bpc(dst, stride, tl, w, h)
;
; Fills the block with the rounded mean of the h samples before tl and the w
; samples starting at tl+2.
;
; Flow: .h<h> sums the samples before tl into m0 and jumps through wq to
; .w<w>, which adds the samples after tl, reduces to one dword, adds the
; rounding term (w+h)/2 and shifts right by tzcnt(w+h). When w != h the
; remaining divisor is 3 or 5 (assuming power-of-two sizes with an aspect
; ratio of at most 4:1 - confirm with callers); it is applied as a fixed-point
; multiply: pmulhuw by 0xAAAB (about 2^17/3) or 0x6667 (about 2^17/5), then
; one more right shift. The result is broadcast and execution falls into the
; matching .s<w> store loop.
;
; The .s* labels are also entry points for ipred_dc_top/left/128 and
; ipred_v_16bpc through the sub-tables. They expect hd = rows left,
; stride3q = 3*stride and the row data in m0 (.s4/.s8/.s64), m0-m1 (.s16) or
; m0-m3 (.s32).
; The .h8 .. .h64 loads and all block stores use mova, i.e. they presumably
; rely on 16-byte alignment of tl and dst - confirm with callers.
; ---------------------------------------------------------------------------
cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    tzcnt               r6d, hd                 ; r6d = log2(h)
    lea                 r5d, [wq+hq]            ; r5d = w + h
    movd                 m4, r5d
    tzcnt               r5d, r5d
    movd                 m5, r5d                ; shift count = tzcnt(w + h)
    LEA                  r5, ipred_dc_16bpc_ssse3_table
    tzcnt                wd, wd                 ; wd = log2(w)
    movsxd               r6, [r5+r6*4]          ; .h<h>
    movsxd               wq, [r5+wq*4+5*4]      ; .w<w>: skip the five .h entries
    pxor                 m3, m3                 ; m3 = 0
    psrlw                m4, 1                  ; rounding term = (w + h) >> 1
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]        ; r5 is no longer needed as table base
    jmp                  r6
.h4:
    movq                 m0, [tlq-8]            ; 4 samples; upper half of m0 = 0
    jmp                  wq
.w4:
    movq                 m1, [tlq+2]
    paddw                m1, m0                 ; pairwise sums of both edges
    punpckhwd            m0, m3                 ; upper words of m0 (zero after movq) as dwords
    punpcklwd            m1, m3                 ; widen the 4 sums to dwords
    paddd                m0, m1
    paddd                m4, m0                 ; add the rounding term
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032          ; swap dwords 0 and 1
    paddd                m0, m1                 ; lane 0 = total + rounding
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3                  ; w = h = 4: divide by 8
    jmp .w4_end
.w4_mul:
    ; r2d/r3d are reused as scratch; tlq and wq are not read again on this path
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 16
    cmove               r2d, r3d                ; h == 16: use 0x6667, else 0xAAAB
    psrld                m0, 2                  ; fixed shift used for this width
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w4_end:
    pshuflw              m0, m0, q0000          ; broadcast to the low 4 words
.s4:                                            ; 4 rows of 8 bytes per iteration
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m0
    movq   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
.h8:
    mova                 m0, [tlq-16]
    jmp                  wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    punpcklwd            m1, m0, m3             ; widen to dwords and reduce
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 8
    je .w8_end                                  ; square block: the shift was exact
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 32
    cmove               r2d, r3d                ; h == 32: use 0x6667
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                 ; broadcast to all 8 words
.s8:                                            ; 4 rows of 16 bytes per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
.h16:
    mova                 m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    test                 hd, 8|32
    cmovz               r2d, r3d                ; h is neither 8 nor 32: use 0x6667
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16c:                                          ; entry for callers that only set m0
    mova                 m1, m0
.s16:                                           ; 4 rows of 32 bytes per iteration
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    mova [dstq+strideq*2+16*0], m0
    mova [dstq+strideq*2+16*1], m1
    mova [dstq+stride3q +16*0], m0
    mova [dstq+stride3q +16*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m0, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    punpcklwd            m1, m0, m3
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 8
    cmove               r2d, r3d                ; h == 8: use 0x6667
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32c:                                          ; entry for callers that only set m0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s32:                                           ; 2 rows of 64 bytes per iteration
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    mova [dstq+strideq*0+16*2], m2
    mova [dstq+strideq*0+16*3], m3
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*3], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s32
    RET
.h64:
    mova                 m0, [tlq-128]
    mova                 m1, [tlq-112]
    paddw                m0, [tlq- 96]
    paddw                m1, [tlq- 80]
    paddw                m0, [tlq- 64]
    paddw                m1, [tlq- 48]
    paddw                m0, [tlq- 32]
    paddw                m1, [tlq- 16]
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+  2]
    movu                 m2, [tlq+ 18]
    paddw                m1, m2
    movu                 m2, [tlq+ 34]
    paddw                m0, m2
    movu                 m2, [tlq+ 50]
    paddw                m1, m2
    movu                 m2, [tlq+ 66]
    paddw                m0, m2
    movu                 m2, [tlq+ 82]
    paddw                m1, m2
    movu                 m2, [tlq+ 98]
    paddw                m0, m2
    movu                 m2, [tlq+114]
    paddw                m1, m2
    paddw                m0, m1
    punpcklwd            m1, m0, m3
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 64
    je .w64_end
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 16
    cmove               r2d, r3d                ; h == 16: use 0x6667
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w64_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s64:                                           ; 1 row of 128 bytes per iteration, m0 only
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m0
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m0
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m0
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m0
    add                dstq, strideq
    dec                  hd
    jg .s64
    RET

; ---------------------------------------------------------------------------
; ipred_dc_128_16bpc(dst, stride, tl, w, h, ...)
;
; Fills the block with a constant: every word is 512 when (argument 8) >> 11
; is 0 and 2048 when it is 1. tl is not read. The stores are done by the
; .s<w> routines of ipred_dc_16bpc via the "128" sub-table.
; NOTE(review): argument 8 (r8m) is presumably the maximum sample value for
; the bit depth (e.g. 1023 -> 0, 4095 -> 1) - confirm against the C prototype.
; ---------------------------------------------------------------------------
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov                 r6d, r8m
    LEA                  r5, ipred_dc_128_16bpc_ssse3_table
    tzcnt                wd, wm                 ; wd = log2(w)
    shr                 r6d, 11                 ; row index: pw_512 or pw_2048
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    ; r5-relative addressing of the constant; movddup fills both halves of m0
    movddup              m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
; ---------------------------------------------------------------------------
; ipred_v_16bpc(dst, stride, tl, w, h)
;
; Copies the row of 16-bit samples starting at tl+2 into every row of the
; block. 64 bytes of that row are loaded into m0-m3 unconditionally; for
; w < 64 the store loop is the .s<w> routine of ipred_dc_16bpc, selected
; through the "splat" sub-table, and it uses only as many of those registers
; as the width needs. w == 64 needs eight registers and is handled locally.
; ---------------------------------------------------------------------------
cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_16bpc_ssse3_table
    movifnidn            hd, hm
    movu                 m0, [tlq+  2]
    movu                 m1, [tlq+ 18]
    movu                 m2, [tlq+ 34]
    movu                 m3, [tlq+ 50]
    cmp                  wd, 64
    je .w64
    tzcnt                wd, wd                 ; wd = log2(w)
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]        ; the shared store loops expect 3*stride
    jmp                  wq
.w64:
    ; x86inc macro; presumably makes xmm6/xmm7 usable under the Win64 ABI,
    ; since the cglobal line above declares only 6 XMM registers - confirm.
    WIN64_SPILL_XMM 8
    movu                 m4, [tlq+ 66]
    movu                 m5, [tlq+ 82]
    movu                 m6, [tlq+ 98]
    movu                 m7, [tlq+114]
.w64_loop:                                      ; one 128-byte row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    mova        [dstq+16*4], m4
    mova        [dstq+16*5], m5
    mova        [dstq+16*6], m6
    mova        [dstq+16*7], m7
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET

; ---------------------------------------------------------------------------
; ipred_h_16bpc(dst, stride, tl, w, h)
;
; Fills each row of the block with a single 16-bit sample taken from the
; samples stored before tl: row 0 uses the sample at tl-2, row 1 the one at
; tl-4, and so on (tlq moves downwards as rows are produced).
;
; m2 and m3 are preloaded with pshufb masks that replicate source word 0 and
; source word 1 respectively; the .w4 and .w8 paths do not use the masks and
; overwrite both registers.
; Only the part of this function up to the start of .w32 is visible here.
; ---------------------------------------------------------------------------
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
; r5 holds the jump-table address, so constants are addressed relative to it
%define base r5-ipred_h_16bpc_ssse3_table
    tzcnt                wd, wm                 ; wd = log2(w)
    LEA                  r5, ipred_h_16bpc_ssse3_table
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m2, [base+pw_256]      ; mask bytes (0,1): replicate word 0
    movddup              m3, [base+pb_2_3]      ; mask bytes (2,3): replicate word 1
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:                                            ; 4 rows of 8 bytes per iteration
    sub                 tlq, 8
    movq                 m3, [tlq]              ; next 4 edge samples
    pshuflw              m0, m3, q3333          ; row 0 <- word 3 (highest address)
    pshuflw              m1, m3, q2222
    pshuflw              m2, m3, q1111
    pshuflw              m3, m3, q0000          ; row 3 <- word 0
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m1
    movq   [dstq+strideq*2], m2
    movq   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:                                            ; 4 rows of 16 bytes per iteration
    sub                 tlq, 8
    movq                 m3, [tlq]
    punpcklwd            m3, m3                 ; duplicate each word: one sample per dword
    pshufd               m0, m3, q3333          ; row 0 <- sample 3
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000          ; row 3 <- sample 0
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:                                           ; 2 rows of 32 bytes per iteration
    sub                 tlq, 4
    movd                 m1, [tlq]              ; next 2 edge samples
    pshufb               m0, m1, m3             ; row 0 <- word 1
    pshufb               m1, m2                 ; row 1 <- word 0
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m0
    mova [dstq+strideq*1+16*0], m1
    mova [dstq+strideq*1+16*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    sub                 tlq, 4
    movd                 m1, [tlq]
    pshufb               m0, m1, m3
    ; the operands of the following instruction lie beyond the visible text
    pshufb
m1, m2 536*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+16*0], m0 537*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+16*1], m0 538*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+16*2], m0 539*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+16*3], m0 540*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+16*0], m1 541*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+16*1], m1 542*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+16*2], m1 543*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+16*3], m1 544*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 545*c0909341SAndroid Build Coastguard Worker sub hd, 2 546*c0909341SAndroid Build Coastguard Worker jg .w32 547*c0909341SAndroid Build Coastguard Worker RET 548*c0909341SAndroid Build Coastguard Worker.w64: 549*c0909341SAndroid Build Coastguard Worker sub tlq, 2 550*c0909341SAndroid Build Coastguard Worker movd m0, [tlq] 551*c0909341SAndroid Build Coastguard Worker pshufb m0, m2 552*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m0 553*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m0 554*c0909341SAndroid Build Coastguard Worker mova [dstq+16*2], m0 555*c0909341SAndroid Build Coastguard Worker mova [dstq+16*3], m0 556*c0909341SAndroid Build Coastguard Worker mova [dstq+16*4], m0 557*c0909341SAndroid Build Coastguard Worker mova [dstq+16*5], m0 558*c0909341SAndroid Build Coastguard Worker mova [dstq+16*6], m0 559*c0909341SAndroid Build Coastguard Worker mova [dstq+16*7], m0 560*c0909341SAndroid Build Coastguard Worker add dstq, strideq 561*c0909341SAndroid Build Coastguard Worker dec hd 562*c0909341SAndroid Build Coastguard Worker jg .w64 563*c0909341SAndroid Build Coastguard Worker RET 564*c0909341SAndroid Build Coastguard Worker 565*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left 566*c0909341SAndroid Build 
Coastguard Worker%define base r5-ipred_paeth_16bpc_ssse3_table 567*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 568*c0909341SAndroid Build Coastguard Worker pshuflw m4, [tlq], q0000 569*c0909341SAndroid Build Coastguard Worker mov leftq, tlq 570*c0909341SAndroid Build Coastguard Worker add hd, hd 571*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m4 ; topleft 572*c0909341SAndroid Build Coastguard Worker sub leftq, hq 573*c0909341SAndroid Build Coastguard Worker and wd, ~7 574*c0909341SAndroid Build Coastguard Worker jnz .w8 575*c0909341SAndroid Build Coastguard Worker movddup m5, [tlq+2] ; top 576*c0909341SAndroid Build Coastguard Worker psubw m6, m5, m4 577*c0909341SAndroid Build Coastguard Worker pabsw m7, m6 578*c0909341SAndroid Build Coastguard Worker.w4_loop: 579*c0909341SAndroid Build Coastguard Worker movd m1, [leftq+hq-4] 580*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m1 581*c0909341SAndroid Build Coastguard Worker punpckldq m1, m1 ; left 582*c0909341SAndroid Build Coastguard Worker%macro PAETH 0 583*c0909341SAndroid Build Coastguard Worker paddw m0, m6, m1 584*c0909341SAndroid Build Coastguard Worker psubw m2, m4, m0 ; tldiff 585*c0909341SAndroid Build Coastguard Worker psubw m0, m5 ; tdiff 586*c0909341SAndroid Build Coastguard Worker pabsw m2, m2 587*c0909341SAndroid Build Coastguard Worker pabsw m0, m0 588*c0909341SAndroid Build Coastguard Worker pminsw m2, m0 589*c0909341SAndroid Build Coastguard Worker pcmpeqw m0, m2 590*c0909341SAndroid Build Coastguard Worker pand m3, m5, m0 591*c0909341SAndroid Build Coastguard Worker pandn m0, m4 592*c0909341SAndroid Build Coastguard Worker por m0, m3 593*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m7, m2 594*c0909341SAndroid Build Coastguard Worker pand m0, m3 595*c0909341SAndroid Build Coastguard Worker pandn m3, m1 596*c0909341SAndroid Build Coastguard Worker por m0, m3 597*c0909341SAndroid Build Coastguard Worker%endmacro 598*c0909341SAndroid Build Coastguard Worker 
PAETH 599*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], m0 600*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], m0 601*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 602*c0909341SAndroid Build Coastguard Worker sub hd, 2*2 603*c0909341SAndroid Build Coastguard Worker jg .w4_loop 604*c0909341SAndroid Build Coastguard Worker RET 605*c0909341SAndroid Build Coastguard Worker.w8: 606*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 607*c0909341SAndroid Build Coastguard Worker PUSH r6 608*c0909341SAndroid Build Coastguard Worker %define r7d hm 609*c0909341SAndroid Build Coastguard Worker %assign regs_used 7 610*c0909341SAndroid Build Coastguard Worker%elif WIN64 611*c0909341SAndroid Build Coastguard Worker movaps r4m, m8 612*c0909341SAndroid Build Coastguard Worker PUSH r7 613*c0909341SAndroid Build Coastguard Worker %assign regs_used 8 614*c0909341SAndroid Build Coastguard Worker%endif 615*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 616*c0909341SAndroid Build Coastguard Worker movddup m8, [pw_256] 617*c0909341SAndroid Build Coastguard Worker%endif 618*c0909341SAndroid Build Coastguard Worker lea tlq, [tlq+wq*2+2] 619*c0909341SAndroid Build Coastguard Worker neg wq 620*c0909341SAndroid Build Coastguard Worker mov r7d, hd 621*c0909341SAndroid Build Coastguard Worker.w8_loop0: 622*c0909341SAndroid Build Coastguard Worker movu m5, [tlq+wq*2] 623*c0909341SAndroid Build Coastguard Worker mov r6, dstq 624*c0909341SAndroid Build Coastguard Worker add dstq, 16 625*c0909341SAndroid Build Coastguard Worker psubw m6, m5, m4 626*c0909341SAndroid Build Coastguard Worker pabsw m7, m6 627*c0909341SAndroid Build Coastguard Worker.w8_loop: 628*c0909341SAndroid Build Coastguard Worker movd m1, [leftq+hq-2] 629*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 630*c0909341SAndroid Build Coastguard Worker pshufb m1, m8 631*c0909341SAndroid Build Coastguard Worker%else 632*c0909341SAndroid Build Coastguard 
Worker pshuflw m1, m1, q0000 633*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 634*c0909341SAndroid Build Coastguard Worker%endif 635*c0909341SAndroid Build Coastguard Worker PAETH 636*c0909341SAndroid Build Coastguard Worker mova [r6], m0 637*c0909341SAndroid Build Coastguard Worker add r6, strideq 638*c0909341SAndroid Build Coastguard Worker sub hd, 1*2 639*c0909341SAndroid Build Coastguard Worker jg .w8_loop 640*c0909341SAndroid Build Coastguard Worker mov hd, r7d 641*c0909341SAndroid Build Coastguard Worker add wq, 8 642*c0909341SAndroid Build Coastguard Worker jl .w8_loop0 643*c0909341SAndroid Build Coastguard Worker%if WIN64 644*c0909341SAndroid Build Coastguard Worker movaps m8, r4m 645*c0909341SAndroid Build Coastguard Worker%endif 646*c0909341SAndroid Build Coastguard Worker RET 647*c0909341SAndroid Build Coastguard Worker 648*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 649*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7 650*c0909341SAndroid Build Coastguard Worker%else 651*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4 652*c0909341SAndroid Build Coastguard Worker%endif 653*c0909341SAndroid Build Coastguard Worker 654*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights 655*c0909341SAndroid Build Coastguard Worker LEA weightsq, smooth_weights_1d_16bpc 656*c0909341SAndroid Build Coastguard Worker mov hd, hm 657*c0909341SAndroid Build Coastguard Worker lea weightsq, [weightsq+hq*4] 658*c0909341SAndroid Build Coastguard Worker neg hq 659*c0909341SAndroid Build Coastguard Worker movd m5, [tlq+hq*2] ; bottom 660*c0909341SAndroid Build Coastguard Worker pshuflw m5, m5, q0000 661*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m5 662*c0909341SAndroid Build Coastguard Worker cmp wd, 4 663*c0909341SAndroid Build Coastguard Worker jne .w8 664*c0909341SAndroid Build Coastguard Worker movddup m4, [tlq+2] ; top 665*c0909341SAndroid Build Coastguard Worker lea 
r3, [strideq*3] 666*c0909341SAndroid Build Coastguard Worker psubw m4, m5 ; top - bottom 667*c0909341SAndroid Build Coastguard Worker.w4_loop: 668*c0909341SAndroid Build Coastguard Worker movq m1, [weightsq+hq*2] 669*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m1 670*c0909341SAndroid Build Coastguard Worker pshufd m0, m1, q1100 671*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m1 672*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 673*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 674*c0909341SAndroid Build Coastguard Worker paddw m0, m5 675*c0909341SAndroid Build Coastguard Worker paddw m1, m5 676*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 677*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 678*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], m1 679*c0909341SAndroid Build Coastguard Worker movhps [dstq+r3 ], m1 680*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 681*c0909341SAndroid Build Coastguard Worker add hq, 4 682*c0909341SAndroid Build Coastguard Worker jl .w4_loop 683*c0909341SAndroid Build Coastguard Worker RET 684*c0909341SAndroid Build Coastguard Worker.w8: 685*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 686*c0909341SAndroid Build Coastguard Worker PUSH r6 687*c0909341SAndroid Build Coastguard Worker %assign regs_used 7 688*c0909341SAndroid Build Coastguard Worker mov hm, hq 689*c0909341SAndroid Build Coastguard Worker %define hq hm 690*c0909341SAndroid Build Coastguard Worker%elif WIN64 691*c0909341SAndroid Build Coastguard Worker PUSH r7 692*c0909341SAndroid Build Coastguard Worker %assign regs_used 8 693*c0909341SAndroid Build Coastguard Worker%endif 694*c0909341SAndroid Build Coastguard Worker.w8_loop0: 695*c0909341SAndroid Build Coastguard Worker mov t0, hq 696*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+2] 697*c0909341SAndroid Build Coastguard Worker add tlq, 16 698*c0909341SAndroid Build Coastguard Worker 
mov r6, dstq 699*c0909341SAndroid Build Coastguard Worker add dstq, 16 700*c0909341SAndroid Build Coastguard Worker psubw m4, m5 701*c0909341SAndroid Build Coastguard Worker.w8_loop: 702*c0909341SAndroid Build Coastguard Worker movq m3, [weightsq+t0*2] 703*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 704*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q0000 705*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q1111 706*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q2222 707*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q3333 708*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m4}, m0, m1, m2, m3 709*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 710*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*0], m0 711*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*1], m1 712*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 713*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*0], m2 714*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*1], m3 715*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 716*c0909341SAndroid Build Coastguard Worker add t0, 4 717*c0909341SAndroid Build Coastguard Worker jl .w8_loop 718*c0909341SAndroid Build Coastguard Worker sub wd, 8 719*c0909341SAndroid Build Coastguard Worker jg .w8_loop0 720*c0909341SAndroid Build Coastguard Worker RET 721*c0909341SAndroid Build Coastguard Worker 722*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights 723*c0909341SAndroid Build Coastguard Worker LEA weightsq, smooth_weights_1d_16bpc 724*c0909341SAndroid Build Coastguard Worker mov wd, wm 725*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 726*c0909341SAndroid Build Coastguard Worker movd m5, [tlq+wq*2] ; right 727*c0909341SAndroid Build Coastguard Worker sub tlq, 8 728*c0909341SAndroid Build Coastguard Worker add hd, hd 729*c0909341SAndroid Build 
Coastguard Worker pshuflw m5, m5, q0000 730*c0909341SAndroid Build Coastguard Worker sub tlq, hq 731*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m5 732*c0909341SAndroid Build Coastguard Worker cmp wd, 4 733*c0909341SAndroid Build Coastguard Worker jne .w8 734*c0909341SAndroid Build Coastguard Worker movddup m4, [weightsq+4*2] 735*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 736*c0909341SAndroid Build Coastguard Worker.w4_loop: 737*c0909341SAndroid Build Coastguard Worker movq m1, [tlq+hq] ; left 738*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m1 739*c0909341SAndroid Build Coastguard Worker psubw m1, m5 ; left - right 740*c0909341SAndroid Build Coastguard Worker pshufd m0, m1, q3322 741*c0909341SAndroid Build Coastguard Worker punpckldq m1, m1 742*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 743*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 744*c0909341SAndroid Build Coastguard Worker paddw m0, m5 745*c0909341SAndroid Build Coastguard Worker paddw m1, m5 746*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], m0 747*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], m0 748*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], m1 749*c0909341SAndroid Build Coastguard Worker movq [dstq+r3 ], m1 750*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 751*c0909341SAndroid Build Coastguard Worker sub hd, 4*2 752*c0909341SAndroid Build Coastguard Worker jg .w4_loop 753*c0909341SAndroid Build Coastguard Worker RET 754*c0909341SAndroid Build Coastguard Worker.w8: 755*c0909341SAndroid Build Coastguard Worker lea weightsq, [weightsq+wq*4] 756*c0909341SAndroid Build Coastguard Worker neg wq 757*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 758*c0909341SAndroid Build Coastguard Worker PUSH r6 759*c0909341SAndroid Build Coastguard Worker %assign regs_used 7 760*c0909341SAndroid Build Coastguard Worker %define hd hm 761*c0909341SAndroid Build Coastguard 
Worker%elif WIN64 762*c0909341SAndroid Build Coastguard Worker PUSH r7 763*c0909341SAndroid Build Coastguard Worker %assign regs_used 8 764*c0909341SAndroid Build Coastguard Worker%endif 765*c0909341SAndroid Build Coastguard Worker.w8_loop0: 766*c0909341SAndroid Build Coastguard Worker mov t0d, hd 767*c0909341SAndroid Build Coastguard Worker mova m4, [weightsq+wq*2] 768*c0909341SAndroid Build Coastguard Worker mov r6, dstq 769*c0909341SAndroid Build Coastguard Worker add dstq, 16 770*c0909341SAndroid Build Coastguard Worker.w8_loop: 771*c0909341SAndroid Build Coastguard Worker movq m3, [tlq+t0*(1+ARCH_X86_32)] 772*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 773*c0909341SAndroid Build Coastguard Worker psubw m3, m5 774*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q3333 775*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q2222 776*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q1111 777*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 778*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m4}, m0, m1, m2, m3 779*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m5}, m0, m1, m2, m3 780*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*0], m0 781*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*1], m1 782*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 783*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*0], m2 784*c0909341SAndroid Build Coastguard Worker mova [r6+strideq*1], m3 785*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 786*c0909341SAndroid Build Coastguard Worker sub t0d, 4*(1+ARCH_X86_64) 787*c0909341SAndroid Build Coastguard Worker jg .w8_loop 788*c0909341SAndroid Build Coastguard Worker add wq, 8 789*c0909341SAndroid Build Coastguard Worker jl .w8_loop0 790*c0909341SAndroid Build Coastguard Worker RET 791*c0909341SAndroid Build Coastguard Worker 792*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 793*c0909341SAndroid Build 
Coastguard WorkerDECLARE_REG_TMP 10 794*c0909341SAndroid Build Coastguard Worker%else 795*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 3 796*c0909341SAndroid Build Coastguard Worker%endif 797*c0909341SAndroid Build Coastguard Worker 798*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ 799*c0909341SAndroid Build Coastguard Worker h_weights, v_weights, top 800*c0909341SAndroid Build Coastguard Worker LEA h_weightsq, smooth_weights_2d_16bpc 801*c0909341SAndroid Build Coastguard Worker mov wd, wm 802*c0909341SAndroid Build Coastguard Worker mov hd, hm 803*c0909341SAndroid Build Coastguard Worker movd m7, [tlq+wq*2] ; right 804*c0909341SAndroid Build Coastguard Worker lea v_weightsq, [h_weightsq+hq*8] 805*c0909341SAndroid Build Coastguard Worker neg hq 806*c0909341SAndroid Build Coastguard Worker movd m6, [tlq+hq*2] ; bottom 807*c0909341SAndroid Build Coastguard Worker pshuflw m7, m7, q0000 808*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q0000 809*c0909341SAndroid Build Coastguard Worker cmp wd, 4 810*c0909341SAndroid Build Coastguard Worker jne .w8 811*c0909341SAndroid Build Coastguard Worker movq m4, [tlq+2] ; top 812*c0909341SAndroid Build Coastguard Worker mova m5, [h_weightsq+4*4] 813*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m6 ; top, bottom 814*c0909341SAndroid Build Coastguard Worker pxor m6, m6 815*c0909341SAndroid Build Coastguard Worker.w4_loop: 816*c0909341SAndroid Build Coastguard Worker movq m1, [v_weightsq+hq*4] 817*c0909341SAndroid Build Coastguard Worker sub tlq, 4 818*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] ; left 819*c0909341SAndroid Build Coastguard Worker pshufd m0, m1, q0000 820*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q1111 821*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4 822*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m7 ; left, right 823*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m4 
824*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q1111 825*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 826*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5 827*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m5 828*c0909341SAndroid Build Coastguard Worker paddd m0, m2 829*c0909341SAndroid Build Coastguard Worker paddd m1, m3 830*c0909341SAndroid Build Coastguard Worker psrld m0, 8 831*c0909341SAndroid Build Coastguard Worker psrld m1, 8 832*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 833*c0909341SAndroid Build Coastguard Worker pavgw m0, m6 834*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 835*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 836*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 837*c0909341SAndroid Build Coastguard Worker add hq, 2 838*c0909341SAndroid Build Coastguard Worker jl .w4_loop 839*c0909341SAndroid Build Coastguard Worker RET 840*c0909341SAndroid Build Coastguard Worker.w8: 841*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 842*c0909341SAndroid Build Coastguard Worker lea h_weightsq, [h_weightsq+wq*4] 843*c0909341SAndroid Build Coastguard Worker mov t0, tlq 844*c0909341SAndroid Build Coastguard Worker mov r1m, tlq 845*c0909341SAndroid Build Coastguard Worker mov r2m, hq 846*c0909341SAndroid Build Coastguard Worker %define m8 [h_weightsq+16*0] 847*c0909341SAndroid Build Coastguard Worker %define m9 [h_weightsq+16*1] 848*c0909341SAndroid Build Coastguard Worker%else 849*c0909341SAndroid Build Coastguard Worker%if WIN64 850*c0909341SAndroid Build Coastguard Worker movaps r4m, m8 851*c0909341SAndroid Build Coastguard Worker movaps r6m, m9 852*c0909341SAndroid Build Coastguard Worker PUSH r7 853*c0909341SAndroid Build Coastguard Worker PUSH r8 854*c0909341SAndroid Build Coastguard Worker%endif 855*c0909341SAndroid Build Coastguard Worker PUSH r9 856*c0909341SAndroid Build Coastguard Worker PUSH r10 857*c0909341SAndroid 
Build Coastguard Worker %assign regs_used 11 858*c0909341SAndroid Build Coastguard Worker lea h_weightsq, [h_weightsq+wq*8] 859*c0909341SAndroid Build Coastguard Worker lea topq, [tlq+wq*2] 860*c0909341SAndroid Build Coastguard Worker neg wq 861*c0909341SAndroid Build Coastguard Worker mov r8, tlq 862*c0909341SAndroid Build Coastguard Worker mov r9, hq 863*c0909341SAndroid Build Coastguard Worker%endif 864*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m6 865*c0909341SAndroid Build Coastguard Worker.w8_loop0: 866*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 867*c0909341SAndroid Build Coastguard Worker movu m5, [t0+2] 868*c0909341SAndroid Build Coastguard Worker add t0, 16 869*c0909341SAndroid Build Coastguard Worker mov r0m, t0 870*c0909341SAndroid Build Coastguard Worker%else 871*c0909341SAndroid Build Coastguard Worker movu m5, [topq+wq*2+2] 872*c0909341SAndroid Build Coastguard Worker mova m8, [h_weightsq+wq*4+16*0] 873*c0909341SAndroid Build Coastguard Worker mova m9, [h_weightsq+wq*4+16*1] 874*c0909341SAndroid Build Coastguard Worker%endif 875*c0909341SAndroid Build Coastguard Worker mov t0, dstq 876*c0909341SAndroid Build Coastguard Worker add dstq, 16 877*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 878*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 879*c0909341SAndroid Build Coastguard Worker.w8_loop: 880*c0909341SAndroid Build Coastguard Worker movd m1, [v_weightsq+hq*4] 881*c0909341SAndroid Build Coastguard Worker sub tlq, 2 882*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] ; left 883*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q0000 884*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4, m1 885*c0909341SAndroid Build Coastguard Worker pshuflw m3, m3, q0000 886*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5 887*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m7 ; left, right 888*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8, m3 889*c0909341SAndroid Build 
Coastguard Worker pmaddwd m3, m9 890*c0909341SAndroid Build Coastguard Worker paddd m0, m2 891*c0909341SAndroid Build Coastguard Worker paddd m1, m3 892*c0909341SAndroid Build Coastguard Worker psrld m0, 8 893*c0909341SAndroid Build Coastguard Worker psrld m1, 8 894*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 895*c0909341SAndroid Build Coastguard Worker pxor m1, m1 896*c0909341SAndroid Build Coastguard Worker pavgw m0, m1 897*c0909341SAndroid Build Coastguard Worker mova [t0], m0 898*c0909341SAndroid Build Coastguard Worker add t0, strideq 899*c0909341SAndroid Build Coastguard Worker inc hq 900*c0909341SAndroid Build Coastguard Worker jl .w8_loop 901*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 902*c0909341SAndroid Build Coastguard Worker mov t0, r0m 903*c0909341SAndroid Build Coastguard Worker mov tlq, r1m 904*c0909341SAndroid Build Coastguard Worker add h_weightsq, 16*2 905*c0909341SAndroid Build Coastguard Worker mov hq, r2m 906*c0909341SAndroid Build Coastguard Worker sub dword wm, 8 907*c0909341SAndroid Build Coastguard Worker jg .w8_loop0 908*c0909341SAndroid Build Coastguard Worker%else 909*c0909341SAndroid Build Coastguard Worker mov tlq, r8 910*c0909341SAndroid Build Coastguard Worker mov hq, r9 911*c0909341SAndroid Build Coastguard Worker add wq, 8 912*c0909341SAndroid Build Coastguard Worker jl .w8_loop0 913*c0909341SAndroid Build Coastguard Worker%endif 914*c0909341SAndroid Build Coastguard Worker%if WIN64 915*c0909341SAndroid Build Coastguard Worker movaps m8, r4m 916*c0909341SAndroid Build Coastguard Worker movaps m9, r6m 917*c0909341SAndroid Build Coastguard Worker%endif 918*c0909341SAndroid Build Coastguard Worker RET 919*c0909341SAndroid Build Coastguard Worker 920*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 921*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx 922*c0909341SAndroid Build Coastguard Worker %define base r7-$$ 923*c0909341SAndroid Build 
Coastguard Worker %define bdmaxm r8m 924*c0909341SAndroid Build Coastguard Worker lea r7, [$$] 925*c0909341SAndroid Build Coastguard Worker%else 926*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx 927*c0909341SAndroid Build Coastguard Worker %define base r1-$$ 928*c0909341SAndroid Build Coastguard Worker %define stridemp [rsp+4*0] 929*c0909341SAndroid Build Coastguard Worker %define bdmaxm [rsp+4*1] 930*c0909341SAndroid Build Coastguard Worker mov r3, r8m 931*c0909341SAndroid Build Coastguard Worker mov stridemp, r1 932*c0909341SAndroid Build Coastguard Worker mov bdmaxm, r3 933*c0909341SAndroid Build Coastguard Worker LEA r1, $$ 934*c0909341SAndroid Build Coastguard Worker%endif 935*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 936*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 937*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 938*c0909341SAndroid Build Coastguard Worker add tlq, 2 939*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] 940*c0909341SAndroid Build Coastguard Worker mov dxd, angled 941*c0909341SAndroid Build Coastguard Worker movddup m0, [base+pw_256] 942*c0909341SAndroid Build Coastguard Worker and dxd, 0x7e 943*c0909341SAndroid Build Coastguard Worker movddup m7, [base+pw_62] 944*c0909341SAndroid Build Coastguard Worker add angled, 165 ; ~90 945*c0909341SAndroid Build Coastguard Worker lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] 946*c0909341SAndroid Build Coastguard Worker movzx dxd, word [base+dr_intra_derivative+dxq] 947*c0909341SAndroid Build Coastguard Worker xor angled, 0x4ff ; d = 90 - angle 948*c0909341SAndroid Build Coastguard Worker jmp wq 949*c0909341SAndroid Build Coastguard Worker.w4: 950*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+88] 951*c0909341SAndroid Build Coastguard Worker test r3d, 0x480 952*c0909341SAndroid Build Coastguard Worker jnz .w4_no_upsample ; 
!enable_intra_edge_filter || angle >= 40 953*c0909341SAndroid Build Coastguard Worker sar r3d, 9 954*c0909341SAndroid Build Coastguard Worker add r3d, hd 955*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 956*c0909341SAndroid Build Coastguard Worker jg .w4_no_upsample ; h > 8 || (w == h && is_sm) 957*c0909341SAndroid Build Coastguard Worker movd m3, [tlq+14] 958*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 959*c0909341SAndroid Build Coastguard Worker movd m1, bdmaxm 960*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 961*c0909341SAndroid Build Coastguard Worker palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 962*c0909341SAndroid Build Coastguard Worker paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 963*c0909341SAndroid Build Coastguard Worker add dxd, dxd 964*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m3 965*c0909341SAndroid Build Coastguard Worker palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 966*c0909341SAndroid Build Coastguard Worker pshufb m1, m0 967*c0909341SAndroid Build Coastguard Worker paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d 968*c0909341SAndroid Build Coastguard Worker psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 969*c0909341SAndroid Build Coastguard Worker movd m4, dxd 970*c0909341SAndroid Build Coastguard Worker psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 971*c0909341SAndroid Build Coastguard Worker paddw m3, m5 972*c0909341SAndroid Build Coastguard Worker pxor m5, m5 973*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m5 974*c0909341SAndroid Build Coastguard Worker mov r3d, dxd 975*c0909341SAndroid Build Coastguard Worker pavgw m3, m5 976*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 977*c0909341SAndroid Build Coastguard Worker pminsw m3, m1 978*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 979*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 980*c0909341SAndroid Build Coastguard Worker mova m3, [base+z_upsample] 981*c0909341SAndroid Build Coastguard 
; NOTE(review): this excerpt begins inside a function whose prologue is not
; visible here. m0, m3, m4, m7, r3, dx, angle, tl, dst, h and `base` are set up
; before this point. In the code below m7 is the mask that extracts `frac` from
; xpos, and m0 is used as a pshufb control right after movd loads of a single
; word - presumably a word-broadcast pattern; confirm against the prologue.

; ---- 4-wide, upsampled edge: tail of the setup, then two rows per iteration ----
    movifnidn   strideq, stridemp
    mova        [rsp+ 0], m1
    paddw       m5, m4, m4
    mova        [rsp+16], m2
    punpcklqdq  m4, m5 ; xpos0 xpos1
.w4_upsample_loop:
    ; r3 carries xpos for the first row, r2 for the second; both are advanced
    ; by dx and shifted down by 6 to index the edge copy stored at [rsp].
    lea         r2d, [r3+dxq]
    shr         r3d, 6 ; base0
    movu        m1, [rsp+r3*2]
    lea         r3d, [r2+dxq]
    shr         r2d, 6 ; base1
    movu        m2, [rsp+r2*2]
    pshufb      m1, m3
    pshufb      m2, m3
    punpcklqdq  m0, m1, m2
    punpckhqdq  m1, m2
    pand        m2, m7, m4 ; frac
    psllw       m2, 9  ; (a * (64 - frac) + b * frac + 32) >> 6
    psubw       m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
    pmulhrsw    m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
    paddw       m4, m5 ; xpos += dx
    paddw       m0, m1
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
    sub         hd, 2
    jg .w4_upsample_loop
    RET

; ---- 4-wide, no upsampling: optional edge filter, then the main loop ----
.w4_no_upsample:
    mov         r3d, 7 ; max_base
    test        angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    ; Derive the filter strength: broadcast h+3 and the angle byte, compare
    ; against the z_filt_wh4 / z_filt_t_w48 tables and collect the byte mask.
    lea         r3d, [hq+3]
    movd        m1, r3d
    movd        m3, angled
    shr         angled, 8 ; is_sm << 1
    pxor        m2, m2
    pshufb      m1, m2
    pshufb      m3, m2
    pcmpeqb     m1, [base+z_filt_wh4]
    pand        m1, m3
    pcmpgtb     m1, [base+z_filt_t_w48+angleq*8]
    pmovmskb    r5d, m1
    mov         r3d, 7
    test        r5d, r5d
    jz .w4_main ; filter_strength == 0
    ; Copy the edge to the stack, replicating the word at tl[-1] in front of it
    ; and the word at tl[r3] behind it, so the filter can run in place.
    pshuflw     m1, [tlq-2], q0000
    movu        m2, [tlq+16*0]
    imul        r5d, 0x55555555
    movd        m3, [tlq+r3*2]
    shr         r5d, 30 ; filter_strength
    movd        [rsp+12], m1
    pshuflw     m3, m3, q0000
    mova        [rsp+16*1], m2
    lea         r2d, [r3+2]
    movq        [rsp+r3*2+18], m3
    cmp         hd, 8
    cmovae      r3d, r2d
    lea         tlq, [rsp+16*1]
    call .filter_edge
.w4_main:
    ; tlq is moved to top[max_base_x]; r5 becomes xpos - (max_base_x << 6), so
    ; it stays negative while the row still reads inside the edge.
    lea         tlq, [tlq+r3*2]
    movd        m4, dxd
    movddup     m1, [base+z_base_inc] ; base_inc << 6
    movd        m6, [tlq] ; top[max_base_x]
    shl         r3d, 6
    movd        m3, r3d
    pshufb      m4, m0
    mov         r5d, dxd ; xpos
    pshufb      m6, m0
    sub         r5, r3
    pshufb      m3, m0
    paddw       m5, m4, m4
    psubw       m3, m1 ; max_base_x
    punpcklqdq  m4, m5 ; xpos0 xpos1
    movifnidn   strideq, stridemp
.w4_loop:
    lea         r3, [r5+dxq]
    sar         r5, 6 ; base0
    movq        m0, [tlq+r5*2+0]
    movq        m1, [tlq+r5*2+2]
    lea         r5, [r3+dxq]
    sar         r3, 6 ; base1
    movhps      m0, [tlq+r3*2+0]
    movhps      m1, [tlq+r3*2+2]
    pand        m2, m7, m4
    psllw       m2, 9
    psubw       m1, m0
    pmulhrsw    m1, m2
    pcmpgtw     m2, m3, m4 ; xpos < max_base_x
    paddw       m4, m5 ; xpos += dx
    paddw       m0, m1
    ; Keep the interpolated value where the mask is set, m6 elsewhere.
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    sub         hd, 2
    jz .w4_end
    lea         dstq, [dstq+strideq*2]
    test        r5d, r5d
    jl .w4_loop
.w4_end_loop:
    ; Remaining rows are filled entirely with m6 (top[max_base_x]).
    movq        [dstq+strideq*0], m6
    movq        [dstq+strideq*1], m6
    lea         dstq, [dstq+strideq*2]
    sub         hd, 2
    jg .w4_end_loop
.w4_end:
    RET

; ---- 8-wide ----
.w8:
    lea         r3d, [angleq+88]
    and         r3d, ~0x7f
    or          r3d, hd
    cmp         r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    ; Upsampling: m5/m6 accumulate the sum of the two centre neighbours, m3 the
    ; sum of the two outer neighbours; (centre - outer) >> 3 is added to centre.
    movu        m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
    movu        m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9
    movu        m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a
    paddw       m5, m1
    paddw       m3, [tlq- 2] ; 0 1 2 3 4 5 6 7
    psubw       m2, m5, m3
    movu        m6, [tlq+18] ; a b c d e f g _
    psraw       m2, 3
    movu        m3, [tlq+20] ; b c d e f g _ _
    paddw       m5, m2
    movu        m2, [tlq+16] ; 9 a b c d e f g
    paddw       m6, m2
    add         dxd, dxd
    cmp         hd, 4
    jne .w8_upsample_h8 ; awkward single-pixel edge case
    pshuflw     m3, m3, q1110 ; b c c _ _ _ _ _
.w8_upsample_h8:
    paddw       m3, [tlq+14] ; 8 9 a b c d e f
    psubw       m4, m6, m3
    movd        m3, bdmaxm
    psraw       m4, 3
    mov         r3d, dxd
    paddw       m6, m4
    ; Clamp below at 0, halve with rounding (pavgw against zero), clamp above
    ; at the broadcast bdmax value.
    pxor        m4, m4
    pmaxsw      m5, m4
    pmaxsw      m6, m4
    pshufb      m3, m0
    pavgw       m5, m4
    pavgw       m6, m4
    movd        m4, dxd
    pminsw      m5, m3
    pminsw      m6, m3
    mova        m3, [base+z_upsample]
    pshufb      m4, m0
    movifnidn   strideq, stridemp
    ; Interleave original samples with the interpolated ones and park the
    ; resulting edge at [rsp+0..63].
    punpcklwd   m0, m1, m5
    mova        [rsp+ 0], m0
    punpckhwd   m1, m5
    mova        [rsp+16], m1
    punpcklwd   m0, m2, m6
    mova        [rsp+32], m0
    punpckhwd   m2, m6
    mova        [rsp+48], m2
    mova        m5, m4
.w8_upsample_loop:
    mov         r2d, r3d
    shr         r2d, 6
    movu        m1, [rsp+r2*2+ 0]
    movu        m2, [rsp+r2*2+16]
    add         r3d, dxd
    pshufb      m1, m3
    pshufb      m2, m3
    punpcklqdq  m0, m1, m2
    punpckhqdq  m1, m2
    pand        m2, m7, m4
    psllw       m2, 9
    psubw       m1, m0
    pmulhrsw    m1, m2
    paddw       m4, m5
    paddw       m0, m1
    mova        [dstq], m0
    add         dstq, strideq
    dec         hd
    jg .w8_upsample_loop
    RET
.w8_no_upsample:
    lea         r3d, [hq+7]
    movd        m1, r3d
    and         r3d, 7
    or          r3d, 8 ; imin(h+7, 15)
    test        angled, 0x400
    jnz .w8_main
    movd        m3, angled
    shr         angled, 8 ; is_sm << 1
    pxor        m2, m2
    pshufb      m1, m2
    pshufb      m3, m2
    movu        m2, [base+z_filt_wh8]
    psrldq      m4, [base+z_filt_t_w48+angleq*8], 4
    pcmpeqb     m2, m1
    pand        m2, m3
    pcmpgtb     m2, m4
    pmovmskb    r5d, m2
    test        r5d, r5d
    jz .w8_main ; filter_strength == 0
    ; Stack copy of the edge with replicated first/last samples, as for w4.
    pshuflw     m1, [tlq-2], q0000
    movu        m2, [tlq+16*0]
    imul        r5d, 0x55555555
    movu        m3, [tlq+16*1]
    movd        m4, [tlq+r3*2]
    shr         r5d, 30 ; filter_strength
    movd        [rsp+12], m1
    mova        [rsp+16*1], m2
    pshuflw     m4, m4, q0000
    mova        [rsp+16*2], m3
    lea         r2d, [r3+2]
    movq        [rsp+r3*2+18], m4
    cmp         hd, 16
    cmovae      r3d, r2d
    lea         tlq, [rsp+16*1]
    call .filter_edge
.w8_main:
    ; m4 = z_base_inc - (max_base_x << 6) + dx, i.e. per-column xpos relative
    ; to max_base_x; its sign bit is the "still inside the edge" mask.
    lea         tlq, [tlq+r3*2]
    movd        m5, dxd
    mova        m4, [base+z_base_inc]
    shl         r3d, 6
    movd        m6, [tlq] ; top[max_base_x]
    movd        m1, r3d
    pshufb      m5, m0
    mov         r5d, dxd ; xpos
    pshufb      m1, m0
    sub         r5, r3
    psubw       m4, m1 ; max_base_x
    pshufb      m6, m0
    paddw       m4, m5
    movifnidn   strideq, stridemp
.w8_loop:
    mov         r3, r5
    sar         r3, 6
    movu        m0, [tlq+r3*2+0]
    movu        m1, [tlq+r3*2+2]
    pand        m2, m7, m4
    psllw       m2, 9
    psubw       m1, m0
    pmulhrsw    m1, m2
    psraw       m2, m4, 15 ; xpos < max_base_x
    paddw       m4, m5 ; xpos += dx
    paddw       m0, m1
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    mova        [dstq], m0
    dec         hd
    jz .w8_end
    add         dstq, strideq
    add         r5, dxq
    jl .w8_loop
.w8_end_loop:
    mova        [dstq], m6
    add         dstq, strideq
    dec         hd
    jg .w8_end_loop
.w8_end:
    RET

; ---- 16-wide ----
.w16:
%if ARCH_X86_32
    %define strideq r3
%endif
    lea         r3d, [hq+15]
    movd        m1, r3d
    and         r3d, 15
    or          r3d, 16 ; imin(h+15, 31)
    test        angled, 0x400
    jnz .w16_main
    movd        m3, angled
    shr         angled, 8 ; is_sm << 1
    pxor        m2, m2
    pshufb      m1, m2
    pshufb      m3, m2
    movq        m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb     m1, [base+z_filt_wh16]
    pand        m1, m3
    pcmpgtb     m1, m4
    pmovmskb    r5d, m1
    test        r5d, r5d
    jz .w16_main ; filter_strength == 0
    pshuflw     m1, [tlq-2], q0000
    movu        m2, [tlq+16*0]
    imul        r5d, 0x24924924
    movu        m3, [tlq+16*1]
    movu        m4, [tlq+16*2]
    shr         r5d, 30
    movu        m5, [tlq+16*3]
    movd        m6, [tlq+r3*2]
    ; adc consumes the carry left by the shr above (the SSE moves in between
    ; do not touch flags).
    adc         r5d, -1 ; filter_strength
    movd        [rsp+12], m1
    mova        [rsp+16*1], m2
    mova        [rsp+16*2], m3
    pshuflw     m6, m6, q0000
    mova        [rsp+16*3], m4
    mova        [rsp+16*4], m5
    lea         r2d, [r3+2]
    movq        [rsp+r3*2+18], m6
    cmp         hd, 32
    cmovae      r3d, r2d
    lea         tlq, [rsp+16*1]
    call .filter_edge
.w16_main:
    lea         tlq, [tlq+r3*2]
    movd        m5, dxd
    mova        m4, [base+z_base_inc]
    shl         r3d, 6
    movd        m6, [tlq] ; top[max_base_x]
    movd        m1, r3d
    pshufb      m5, m0
    mov         r5d, dxd ; xpos
    pshufb      m1, m0
    sub         r5, r3
    psubw       m4, m1 ; max_base_x
    pshufb      m6, m0
    paddw       m4, m5
.w16_loop:
    ; Columns 0-7 use the sign of m4 as mask; columns 8-15 compare m4 against
    ; pw_m512 (presumably -(8 << 6); the constant is defined outside this view).
    mov         r3, r5
    sar         r3, 6
    movu        m0, [tlq+r3*2+ 0]
    movu        m2, [tlq+r3*2+ 2]
    pand        m3, m7, m4
    psllw       m3, 9
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+16]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+18]
    psubw       m2, m1
    pmulhrsw    m2, m3
    movddup     m3, [base+pw_m512]
    paddw       m1, m2
    psraw       m2, m4, 15
    pcmpgtw     m3, m4
    paddw       m4, m5
    pand        m0, m2
    pandn       m2, m6
    pand        m1, m3
    pandn       m3, m6
    por         m0, m2
    mova        [dstq+16*0], m0
    por         m1, m3
    mova        [dstq+16*1], m1
    dec         hd
    jz .w16_end
    movifnidn   strideq, stridemp
    add         dstq, strideq
    add         r5, dxq
    jl .w16_loop
.w16_end_loop:
    mova        [dstq+16*0], m6
    mova        [dstq+16*1], m6
    add         dstq, strideq
    dec         hd
    jg .w16_end_loop
.w16_end:
    RET

; ---- 32-wide: when filtering is enabled the edge always goes through
; .filter_edge_s3 ----
.w32:
    lea         r3d, [hq+31]
    and         r3d, 31
    or          r3d, 32 ; imin(h+31, 63)
    test        angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    call .filter_copy
    lea         r5d, [r3+2]
    cmp         hd, 64
    cmove       r3d, r5d
    call .filter_edge_s3
.w32_main:
    lea         tlq, [tlq+r3*2]
    movd        m5, dxd
    mova        m4, [base+z_base_inc]
    shl         r3d, 6
    movd        m6, [tlq] ; top[max_base_x]
    movd        m1, r3d
    pshufb      m5, m0
    mov         r5d, dxd ; xpos
    pshufb      m1, m0
    sub         r5, r3
    psubw       m4, m1 ; max_base_x
    pshufb      m6, m0
    paddw       m4, m5
.w32_loop:
    ; Four groups of 8 columns; each later group is masked by comparing m4
    ; against a further pw_m* constant (pw_m512, pw_m1024, pw_m1536).
    mov         r3, r5
    sar         r3, 6
    movu        m0, [tlq+r3*2+ 0]
    movu        m2, [tlq+r3*2+ 2]
    pand        m3, m7, m4
    psllw       m3, 9
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+16]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+18]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    psraw       m2, m4, 15
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    movddup     m2, [base+pw_m512]
    pcmpgtw     m2, m4
    pand        m1, m2
    pandn       m2, m6
    mova        [dstq+16*0], m0
    por         m1, m2
    mova        [dstq+16*1], m1
    movu        m0, [tlq+r3*2+32]
    movu        m2, [tlq+r3*2+34]
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+48]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+50]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    movddup     m2, [base+pw_m1024]
    movddup     m3, [base+pw_m1536]
    pcmpgtw     m2, m4
    pcmpgtw     m3, m4
    paddw       m4, m5
    pand        m0, m2
    pandn       m2, m6
    pand        m1, m3
    pandn       m3, m6
    por         m0, m2
    mova        [dstq+16*2], m0
    por         m1, m3
    mova        [dstq+16*3], m1
    dec         hd
    jz .w32_end
    movifnidn   strideq, stridemp
    add         dstq, strideq
    add         r5, dxq
    jl .w32_loop
.w32_end_loop:
    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3
    add         dstq, strideq
    dec         hd
    jg .w32_end_loop
.w32_end:
    RET

; ---- 64-wide ----
.w64:
    lea         r3d, [hq+63]
    test        angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    call .filter_copy
    call .filter_edge_s3
.w64_main:
    lea         tlq, [tlq+r3*2]
    movd        m5, dxd
    mova        m4, [base+z_base_inc]
    shl         r3d, 6
    movd        m6, [tlq] ; top[max_base_x]
    movd        m1, r3d
    pshufb      m5, m0
    mov         r5d, dxd ; xpos
    pshufb      m1, m0
    sub         r5, r3
    psubw       m4, m1 ; max_base_x
    pshufb      m6, m0
    paddw       m4, m5
.w64_loop:
    ; Eight groups of 8 columns, same scheme as .w32_loop with the additional
    ; thresholds pw_m2048 .. pw_m3584. m3 holds frac << 9 until the last group,
    ; where it is reused for a mask.
    mov         r3, r5
    sar         r3, 6
    movu        m0, [tlq+r3*2+ 0]
    movu        m2, [tlq+r3*2+ 2]
    pand        m3, m7, m4
    psllw       m3, 9
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+16]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+18]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    psraw       m2, m4, 15
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    movddup     m2, [base+pw_m512]
    pcmpgtw     m2, m4
    pand        m1, m2
    pandn       m2, m6
    mova        [dstq+16*0], m0
    por         m1, m2
    mova        [dstq+16*1], m1
    movu        m0, [tlq+r3*2+32]
    movu        m2, [tlq+r3*2+34]
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+48]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+50]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    movddup     m2, [base+pw_m1024]
    pcmpgtw     m2, m4
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    movddup     m2, [base+pw_m1536]
    pcmpgtw     m2, m4
    pand        m1, m2
    pandn       m2, m6
    mova        [dstq+16*2], m0
    por         m1, m2
    mova        [dstq+16*3], m1
    movu        m0, [tlq+r3*2+64]
    movu        m2, [tlq+r3*2+66]
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+80]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+82]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    movddup     m2, [base+pw_m2048]
    pcmpgtw     m2, m4
    pand        m0, m2
    pandn       m2, m6
    por         m0, m2
    movddup     m2, [base+pw_m2560]
    pcmpgtw     m2, m4
    pand        m1, m2
    pandn       m2, m6
    mova        [dstq+16*4], m0
    por         m1, m2
    mova        [dstq+16*5], m1
    movu        m0, [tlq+r3*2+96]
    movu        m2, [tlq+r3*2+98]
    psubw       m2, m0
    pmulhrsw    m2, m3
    movu        m1, [tlq+r3*2+112]
    paddw       m0, m2
    movu        m2, [tlq+r3*2+114]
    psubw       m2, m1
    pmulhrsw    m2, m3
    paddw       m1, m2
    movddup     m2, [base+pw_m3072]
    movddup     m3, [base+pw_m3584]
    pcmpgtw     m2, m4
    pcmpgtw     m3, m4
    paddw       m4, m5
    pand        m0, m2
    pandn       m2, m6
    pand        m1, m3
    pandn       m3, m6
    por         m0, m2
    mova        [dstq+16*6], m0
    por         m1, m3
    mova        [dstq+16*7], m1
    dec         hd
    jz .w64_end
    movifnidn   strideq, stridemp
    add         dstq, strideq
    add         r5, dxq
    jl .w64_loop
.w64_end_loop:
    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    add         dstq, strideq
    dec         hd
    jg .w64_end_loop
.w64_end:
    RET

ALIGN function_align
; .filter_copy: copy the edge at tlq onto the stack and repoint tlq at the copy.
;   in:  tlq = edge pointer, r3 = index of the last edge sample
;   out: tlq = rsp+gprsize+16*1; the word from the old tlq[-1] is stored at the
;        two word slots just below it, and the word from the old tlq[r3] is
;        replicated into the four word slots starting at tlq[r3+1].
;   clobbers: r5, m1-m3
; The +gprsize in the addresses presumably skips the return address pushed by
; the `call` (the inline copies above use the same offsets without it).
.filter_copy:
    pshuflw     m2, [tlq-2], q0000
    pshuflw     m3, [tlq+r3*2], q0000
    xor         r5d, r5d
    movd        [rsp+gprsize+12], m2
.filter_copy_loop:
    ; 16 words per iteration; r5 is bumped before the stores, hence the
    ; -16*1 / -16*0 displacements.
    movu        m1, [tlq+r5*2+16*0]
    movu        m2, [tlq+r5*2+16*1]
    add         r5d, 16
    mova        [rsp+r5*2+gprsize-16*1], m1
    mova        [rsp+r5*2+gprsize-16*0], m2
    cmp         r5d, r3d
    jle .filter_copy_loop
    lea         tlq, [rsp+gprsize+16*1]
    movq        [tlq+r3*2+2], m3
    ret

; .filter_edge: in-place 3-tap smoothing of the edge at tlq.
;   in:  tlq = edge (aligned for mova, with valid words at tlq[-1] and beyond
;        the end), r3 = number of words to process (handled in groups of 8),
;        r5 = filter strength; strength 3 is forwarded to .filter_edge_s3.
;   per word: out = (k0*p[i] + k1*(p[i-1] + p[i+1]) + 8) >> 4, with k0/k1 read
;        from the z_filt_k table at [r5*8-8] and [r5*8+8].
;   clobbers: r5, m1-m6
.filter_edge:
    cmp         r5d, 3
    je .filter_edge_s3
    movddup     m4, [base+z_filt_k+r5*8-8]
    movddup     m5, [base+z_filt_k+r5*8+8]
    xor         r5d, r5d
    movddup     m6, [base+pw_8]
    movu        m2, [tlq-2]
    jmp .filter_edge_start
.filter_edge_loop:
    ; Load the left neighbours of the next group before the previous group's
    ; result is written, so the filter reads unfiltered input for that tap.
    movu        m2, [tlq+r5*2-2]
    mova        [tlq+r5*2-16], m1
.filter_edge_start:
    pmullw      m1, m4, [tlq+r5*2]
    movu        m3, [tlq+r5*2+2]
    paddw       m2, m3
    pmullw      m2, m5
    add         r5d, 8
    paddw       m1, m6
    paddw       m1, m2
    psrlw       m1, 4
    cmp         r5d, r3d
    jl .filter_edge_loop
    mova        [tlq+r5*2-16], m1
    ret

; .filter_edge_s3: in-place 5-tap smoothing of the edge at tlq.
;   in:  tlq = edge (valid words at tlq[-2], tlq[-1] and two past the end),
;        r3 = number of words to process (handled in groups of 8)
;   per word: out = (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
;        (the "+3" constant plus pavgw's built-in +1 give the +4).
;   clobbers: r5, m1-m5
.filter_edge_s3:
    movddup     m5, [base+pw_3]
    xor         r5d, r5d
    movu        m2, [tlq-2]
    movu        m3, [tlq-4]
    jmp .filter_edge_s3_start
.filter_edge_s3_loop:
    ; As above: fetch the p[i-1] / p[i-2] taps before overwriting them.
    movu        m2, [tlq+r5*2-2]
    movu        m3, [tlq+r5*2-4]
    mova        [tlq+r5*2-16], m1
.filter_edge_s3_start:
    paddw       m2, [tlq+r5*2+0]
    paddw       m3, m5
    movu        m1, [tlq+r5*2+2]
    movu        m4, [tlq+r5*2+4]
    add         r5d, 8
    paddw       m1, m2
    pavgw       m3, m4
    paddw       m1, m3
    psrlw       m1, 2
    cmp         r5d, r3d
    jl .filter_edge_s3_loop
    mova        [tlq+r5*2-16], m1
    ret

%if ARCH_X86_64
cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
    %define base r7-$$
    %define maxwm r6m
    %define maxhm r7m
    %define bdmaxm r8m
    lea         r7, [$$]
    mov         hd, hm
    movddup     m8, [base+pw_62]
    lea         r9d, [wq-4]
    shl         r9d, 6
    mova        m9, [base+z2_top_shufA]
    or          r9d, hd
    mova        m10, [base+z2_left_shufA]
%else
cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
%define base r1-$$ 1636*c0909341SAndroid Build Coastguard Worker %define r9b byte [rsp+16*26+4*0] 1637*c0909341SAndroid Build Coastguard Worker %define r9d dword [rsp+16*26+4*0] 1638*c0909341SAndroid Build Coastguard Worker %define r10d dword [rsp+16*26+4*1] 1639*c0909341SAndroid Build Coastguard Worker %define r11d dword [rsp+16*26+4*2] 1640*c0909341SAndroid Build Coastguard Worker %define maxwm [rsp+16*2+4*0] 1641*c0909341SAndroid Build Coastguard Worker %define maxhm [rsp+16*2+4*1] 1642*c0909341SAndroid Build Coastguard Worker %define bdmaxm [rsp+16*2+4*2] 1643*c0909341SAndroid Build Coastguard Worker %define stridemp [rsp+16*26+4*3] 1644*c0909341SAndroid Build Coastguard Worker %define strideq r3 1645*c0909341SAndroid Build Coastguard Worker %define dyd r4 1646*c0909341SAndroid Build Coastguard Worker %define dyq r4 1647*c0909341SAndroid Build Coastguard Worker mov stridemp, r1 1648*c0909341SAndroid Build Coastguard Worker mov r1d, r6m 1649*c0909341SAndroid Build Coastguard Worker mov r4d, r7m 1650*c0909341SAndroid Build Coastguard Worker mov r5d, r8m 1651*c0909341SAndroid Build Coastguard Worker mov maxwm, r1d 1652*c0909341SAndroid Build Coastguard Worker mov maxhm, r4d 1653*c0909341SAndroid Build Coastguard Worker mov bdmaxm, r5d 1654*c0909341SAndroid Build Coastguard Worker LEA r1, $$ 1655*c0909341SAndroid Build Coastguard Worker lea hd, [wq-4] 1656*c0909341SAndroid Build Coastguard Worker mova m0, [base+z2_top_shufA] 1657*c0909341SAndroid Build Coastguard Worker shl hd, 6 1658*c0909341SAndroid Build Coastguard Worker mova m1, [base+z2_left_shufA] 1659*c0909341SAndroid Build Coastguard Worker or hd, hm 1660*c0909341SAndroid Build Coastguard Worker mova [rsp+16*24], m0 1661*c0909341SAndroid Build Coastguard Worker mov r9d, hd 1662*c0909341SAndroid Build Coastguard Worker mova [rsp+16*25], m1 1663*c0909341SAndroid Build Coastguard Worker%endif 1664*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 1665*c0909341SAndroid Build Coastguard Worker movifnidn 
angled, anglem 1666*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16*8] 1667*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-16*7] 1668*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16*6] 1669*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*5] 1670*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4] 1671*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1672*c0909341SAndroid Build Coastguard Worker movzx dxd, angleb 1673*c0909341SAndroid Build Coastguard Worker%else 1674*c0909341SAndroid Build Coastguard Worker movzx dxd, byte anglem 1675*c0909341SAndroid Build Coastguard Worker%endif 1676*c0909341SAndroid Build Coastguard Worker mova m4, [tlq-16*4] 1677*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-16*3] 1678*c0909341SAndroid Build Coastguard Worker mova m6, [tlq-16*2] 1679*c0909341SAndroid Build Coastguard Worker mova m7, [tlq-16*1] 1680*c0909341SAndroid Build Coastguard Worker mova [rsp+16* 5], m0 1681*c0909341SAndroid Build Coastguard Worker xor angled, 0x400 1682*c0909341SAndroid Build Coastguard Worker mova [rsp+16* 6], m1 1683*c0909341SAndroid Build Coastguard Worker mov dyd, dxd 1684*c0909341SAndroid Build Coastguard Worker mova [rsp+16* 7], m2 1685*c0909341SAndroid Build Coastguard Worker neg dxq 1686*c0909341SAndroid Build Coastguard Worker mova [rsp+16* 8], m3 1687*c0909341SAndroid Build Coastguard Worker and dyd, ~1 1688*c0909341SAndroid Build Coastguard Worker mova [rsp+16* 9], m4 1689*c0909341SAndroid Build Coastguard Worker and dxq, ~1 1690*c0909341SAndroid Build Coastguard Worker mova [rsp+16*10], m5 1691*c0909341SAndroid Build Coastguard Worker lea wq, [base+ipred_z2_16bpc_ssse3_table+wq] 1692*c0909341SAndroid Build Coastguard Worker mova [rsp+16*11], m6 1693*c0909341SAndroid Build Coastguard Worker pxor m3, m3 1694*c0909341SAndroid Build Coastguard Worker mova [rsp+16*12], m7 1695*c0909341SAndroid Build Coastguard Worker movzx dyd, word 
[base+dr_intra_derivative+dyq-90] ; angle - 90 1696*c0909341SAndroid Build Coastguard Worker movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle 1697*c0909341SAndroid Build Coastguard Worker movddup m0, [base+pw_256] ; 4<<6 1698*c0909341SAndroid Build Coastguard Worker movd m4, [tlq] 1699*c0909341SAndroid Build Coastguard Worker movu m5, [tlq+16*0+2] 1700*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+16*1+2] 1701*c0909341SAndroid Build Coastguard Worker movsldup m1, [base+z2_dy_offset] 1702*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 1703*c0909341SAndroid Build Coastguard Worker movq m7, [base+z_base_inc+2] 1704*c0909341SAndroid Build Coastguard Worker mov r11d, (112-4)<<6 1705*c0909341SAndroid Build Coastguard Worker mova [rsp+16*13], m4 1706*c0909341SAndroid Build Coastguard Worker neg dxd 1707*c0909341SAndroid Build Coastguard Worker mova [rsp+16*14], m5 1708*c0909341SAndroid Build Coastguard Worker or dyd, 4<<16 1709*c0909341SAndroid Build Coastguard Worker mova [rsp+16*15], m6 1710*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1711*c0909341SAndroid Build Coastguard Worker lea r10d, [dxq+(112<<6)] ; xpos 1712*c0909341SAndroid Build Coastguard Worker%else 1713*c0909341SAndroid Build Coastguard Worker mov [rsp+8*3], dyd 1714*c0909341SAndroid Build Coastguard Worker lea r4d, [dxq+(112<<6)] 1715*c0909341SAndroid Build Coastguard Worker mov r10d, r4d 1716*c0909341SAndroid Build Coastguard Worker movzx hd, r9b 1717*c0909341SAndroid Build Coastguard Worker%endif 1718*c0909341SAndroid Build Coastguard Worker movq [rsp+8*0], m1 1719*c0909341SAndroid Build Coastguard Worker movq [rsp+8*1], m0 1720*c0909341SAndroid Build Coastguard Worker movq [rsp+8*2], m7 1721*c0909341SAndroid Build Coastguard Worker jmp wq 1722*c0909341SAndroid Build Coastguard Worker.w4: 1723*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1724*c0909341SAndroid Build Coastguard Worker jnz .w4_main 1725*c0909341SAndroid Build Coastguard Worker lea 
r3d, [hq+2] 1726*c0909341SAndroid Build Coastguard Worker add angled, 1022 1727*c0909341SAndroid Build Coastguard Worker pshuflw m1, m5, q3333 1728*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1729*c0909341SAndroid Build Coastguard Worker movq [rsp+16*14+8], m1 1730*c0909341SAndroid Build Coastguard Worker test r3d, angled 1731*c0909341SAndroid Build Coastguard Worker jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 1732*c0909341SAndroid Build Coastguard Worker call .upsample_above 1733*c0909341SAndroid Build Coastguard Worker sub angled, 1075 ; angle - 53 1734*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1735*c0909341SAndroid Build Coastguard Worker xor angled, 0x7f ; 180 - angle 1736*c0909341SAndroid Build Coastguard Worker movd m2, r3d 1737*c0909341SAndroid Build Coastguard Worker movd m7, angled 1738*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1739*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 1740*c0909341SAndroid Build Coastguard Worker pshufb m7, m3 1741*c0909341SAndroid Build Coastguard Worker pcmpeqb m2, [base+z_filt_wh4] 1742*c0909341SAndroid Build Coastguard Worker pand m7, m2 1743*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, [base+z_filt_t_w48+angleq*8] 1744*c0909341SAndroid Build Coastguard Worker jmp .w8_filter_left 1745*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8 1746*c0909341SAndroid Build Coastguard Worker paddw m2, m5, [tlq] 1747*c0909341SAndroid Build Coastguard Worker movu m1, [rsp+gprsize+16*14+2] 1748*c0909341SAndroid Build Coastguard Worker movu m4, [rsp+gprsize+16*14-4] 1749*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1750*c0909341SAndroid Build Coastguard Worker movd m6, r9m ; bdmax, offset due to call 1751*c0909341SAndroid Build Coastguard Worker%else 1752*c0909341SAndroid Build Coastguard Worker movd m6, [rsp+gprsize+16*2+4*2] 1753*c0909341SAndroid Build Coastguard Worker%endif 1754*c0909341SAndroid Build Coastguard Worker 
paddw m4, m1 1755*c0909341SAndroid Build Coastguard Worker psubw m1, m2, m4 1756*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 1757*c0909341SAndroid Build Coastguard Worker psraw m1, 3 1758*c0909341SAndroid Build Coastguard Worker paddw m2, m1 1759*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1760*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m3 1761*c0909341SAndroid Build Coastguard Worker paddw m7, m7 1762*c0909341SAndroid Build Coastguard Worker pavgw m2, m3 1763*c0909341SAndroid Build Coastguard Worker pminsw m2, m6 1764*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1765*c0909341SAndroid Build Coastguard Worker mova m9, [base+z2_top_shufB] 1766*c0909341SAndroid Build Coastguard Worker lea r10d, [dxq+(113<<6)] 1767*c0909341SAndroid Build Coastguard Worker mov r11d, (112-7)<<6 1768*c0909341SAndroid Build Coastguard Worker%else 1769*c0909341SAndroid Build Coastguard Worker mova m1, [base+z2_top_shufB] 1770*c0909341SAndroid Build Coastguard Worker lea r3d, [dxq+(113<<6)] 1771*c0909341SAndroid Build Coastguard Worker mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6 1772*c0909341SAndroid Build Coastguard Worker mov [rsp+gprsize+16*26+4*1], r3d 1773*c0909341SAndroid Build Coastguard Worker mova [rsp+gprsize+16*24], m1 1774*c0909341SAndroid Build Coastguard Worker%endif 1775*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m5 1776*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m5 1777*c0909341SAndroid Build Coastguard Worker movq [rsp+gprsize+8*2], m7 1778*c0909341SAndroid Build Coastguard Worker mova [rsp+gprsize+16*14], m1 1779*c0909341SAndroid Build Coastguard Worker mova [rsp+gprsize+16*15], m2 1780*c0909341SAndroid Build Coastguard Worker ret 1781*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above: 1782*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1783*c0909341SAndroid Build Coastguard Worker mov [rsp+16*4], angled 1784*c0909341SAndroid Build Coastguard Worker sub angled, 1112 ; angle - 90 
1785*c0909341SAndroid Build Coastguard Worker movd m2, r3d 1786*c0909341SAndroid Build Coastguard Worker mov r3d, 90 1787*c0909341SAndroid Build Coastguard Worker movd m1, angled 1788*c0909341SAndroid Build Coastguard Worker sub r3d, angled ; 180 - angle 1789*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1790*c0909341SAndroid Build Coastguard Worker mova m4, [base+z_filt_wh4] 1791*c0909341SAndroid Build Coastguard Worker movd m7, r3d 1792*c0909341SAndroid Build Coastguard Worker mova m5, [base+z_filt_t_w48+angleq*8] 1793*c0909341SAndroid Build Coastguard Worker mov r3d, 4 1794*c0909341SAndroid Build Coastguard Worker call .w8_filter_top 1795*c0909341SAndroid Build Coastguard Worker mov angled, [rsp+16*4] 1796*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+2] 1797*c0909341SAndroid Build Coastguard Worker sub angled, 139 1798*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1799*c0909341SAndroid Build Coastguard Worker test r3d, angled 1800*c0909341SAndroid Build Coastguard Worker jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 1801*c0909341SAndroid Build Coastguard Worker.upsample_left: ; w4/w8 1802*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16] 1803*c0909341SAndroid Build Coastguard Worker lea r3d, [hq-4] 1804*c0909341SAndroid Build Coastguard Worker movu m3, [tlq-14] 1805*c0909341SAndroid Build Coastguard Worker movu m4, [rsp+16*12+4] 1806*c0909341SAndroid Build Coastguard Worker pshufb m1, m2, [base+z2_upsample_l+r3*4] 1807*c0909341SAndroid Build Coastguard Worker movd m6, bdmaxm 1808*c0909341SAndroid Build Coastguard Worker pxor m5, m5 1809*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1810*c0909341SAndroid Build Coastguard Worker paddw m4, m1 1811*c0909341SAndroid Build Coastguard Worker psubw m1, m3, m4 1812*c0909341SAndroid Build Coastguard Worker movshdup m4, [base+z2_dy_offset] 1813*c0909341SAndroid Build Coastguard Worker psraw m1, 3 1814*c0909341SAndroid Build Coastguard Worker pshufb 
m6, m0 1815*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1816*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m5 1817*c0909341SAndroid Build Coastguard Worker pavgw m3, m5 1818*c0909341SAndroid Build Coastguard Worker pminsw m3, m6 1819*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1820*c0909341SAndroid Build Coastguard Worker mova m10, [base+z2_left_shufB] 1821*c0909341SAndroid Build Coastguard Worker add dyd, dyd 1822*c0909341SAndroid Build Coastguard Worker%else 1823*c0909341SAndroid Build Coastguard Worker mova m1, [base+z2_left_shufB] 1824*c0909341SAndroid Build Coastguard Worker shl dword [rsp+8*3], 1 1825*c0909341SAndroid Build Coastguard Worker mova [rsp+16*25], m1 1826*c0909341SAndroid Build Coastguard Worker%endif 1827*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 1828*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 1829*c0909341SAndroid Build Coastguard Worker movq [rsp+8*0], m4 1830*c0909341SAndroid Build Coastguard Worker mova [rsp+16*12], m1 1831*c0909341SAndroid Build Coastguard Worker mova [rsp+16*11], m2 1832*c0909341SAndroid Build Coastguard Worker.w4_main: 1833*c0909341SAndroid Build Coastguard Worker movd m6, dxd 1834*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1835*c0909341SAndroid Build Coastguard Worker movd m3, dyd 1836*c0909341SAndroid Build Coastguard Worker%else 1837*c0909341SAndroid Build Coastguard Worker movd m3, [rsp+8*3] 1838*c0909341SAndroid Build Coastguard Worker%endif 1839*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 1840*c0909341SAndroid Build Coastguard Worker movddup m0, [rsp+8*2] 1841*c0909341SAndroid Build Coastguard Worker paddw m7, m6, m6 1842*c0909341SAndroid Build Coastguard Worker movq m5, [base+pw_m1to4] 1843*c0909341SAndroid Build Coastguard Worker pshuflw m4, m3, q0000 1844*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7 1845*c0909341SAndroid Build Coastguard Worker pmullw m4, m5 1846*c0909341SAndroid Build Coastguard Worker pshuflw m3, 
m3, q1111 1847*c0909341SAndroid Build Coastguard Worker paddw m6, m0 1848*c0909341SAndroid Build Coastguard Worker mov r2d, r10d 1849*c0909341SAndroid Build Coastguard Worker pshuflw m0, m4, q3333 1850*c0909341SAndroid Build Coastguard Worker psubw m4, [rsp+8*0] 1851*c0909341SAndroid Build Coastguard Worker movq [rsp+8*3], m3 1852*c0909341SAndroid Build Coastguard Worker movq [rsp+8*5], m0 ; dy*4 1853*c0909341SAndroid Build Coastguard Worker mov r5, dstq 1854*c0909341SAndroid Build Coastguard Worker.w4_loop0: 1855*c0909341SAndroid Build Coastguard Worker mova [rsp+16*4], m6 1856*c0909341SAndroid Build Coastguard Worker movq [rsp+8*4], m4 1857*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1858*c0909341SAndroid Build Coastguard Worker pand m0, m8, m4 1859*c0909341SAndroid Build Coastguard Worker%else 1860*c0909341SAndroid Build Coastguard Worker movq m0, [base+pw_62] 1861*c0909341SAndroid Build Coastguard Worker pand m0, m4 1862*c0909341SAndroid Build Coastguard Worker%endif 1863*c0909341SAndroid Build Coastguard Worker psraw m4, 6 1864*c0909341SAndroid Build Coastguard Worker psllw m0, 9 ; frac_y << 9 1865*c0909341SAndroid Build Coastguard Worker movq [rsp+8*7], m0 1866*c0909341SAndroid Build Coastguard Worker pabsw m4, m4 1867*c0909341SAndroid Build Coastguard Worker movq [rsp+8*6], m4 1868*c0909341SAndroid Build Coastguard Worker movzx hd, r9b 1869*c0909341SAndroid Build Coastguard Worker.w4_loop: 1870*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 1871*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base_x0 1872*c0909341SAndroid Build Coastguard Worker movu m2, [rsp+r2*2] 1873*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 1874*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base_x1 1875*c0909341SAndroid Build Coastguard Worker movu m1, [rsp+r3*2] 1876*c0909341SAndroid Build Coastguard Worker lea r3d, [r2+dxq] 1877*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base_x2 1878*c0909341SAndroid Build Coastguard 
Worker movu m3, [rsp+r2*2] 1879*c0909341SAndroid Build Coastguard Worker lea r2d, [r3+dxq] 1880*c0909341SAndroid Build Coastguard Worker shr r3d, 6 ; base_x3 1881*c0909341SAndroid Build Coastguard Worker movu m4, [rsp+r3*2] 1882*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1883*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m9}, m2, m1, m3, m4 1884*c0909341SAndroid Build Coastguard Worker%else 1885*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+16*24] 1886*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m0}, m2, m1, m3, m4 1887*c0909341SAndroid Build Coastguard Worker%endif 1888*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m2, m1 1889*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m1 1890*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m3, m4 1891*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m4 1892*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1893*c0909341SAndroid Build Coastguard Worker pand m5, m8, m6 1894*c0909341SAndroid Build Coastguard Worker%else 1895*c0909341SAndroid Build Coastguard Worker movddup m5, [base+pw_62] 1896*c0909341SAndroid Build Coastguard Worker pand m5, m6 1897*c0909341SAndroid Build Coastguard Worker%endif 1898*c0909341SAndroid Build Coastguard Worker psllw m5, 9 1899*c0909341SAndroid Build Coastguard Worker psubw m2, m0 1900*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 1901*c0909341SAndroid Build Coastguard Worker paddw m5, m6, m7 1902*c0909341SAndroid Build Coastguard Worker psubw m3, m1 1903*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1904*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1905*c0909341SAndroid Build Coastguard Worker pand m2, m8, m5 1906*c0909341SAndroid Build Coastguard Worker%else 1907*c0909341SAndroid Build Coastguard Worker movddup m2, [base+pw_62] 1908*c0909341SAndroid Build Coastguard Worker pand m2, m5 1909*c0909341SAndroid Build Coastguard Worker%endif 1910*c0909341SAndroid Build Coastguard Worker 
psllw m2, 9 1911*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 1912*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1913*c0909341SAndroid Build Coastguard Worker cmp r3d, 111 ; topleft 1914*c0909341SAndroid Build Coastguard Worker jge .w4_toponly 1915*c0909341SAndroid Build Coastguard Worker mova [rsp+16*22], m0 1916*c0909341SAndroid Build Coastguard Worker mova [rsp+16*23], m1 1917*c0909341SAndroid Build Coastguard Worker movzx r3d, byte [rsp+8*6+0] ; base_y0 1918*c0909341SAndroid Build Coastguard Worker movu m3, [rsp+r3*2] 1919*c0909341SAndroid Build Coastguard Worker movzx r3d, byte [rsp+8*6+2] ; base_y1 1920*c0909341SAndroid Build Coastguard Worker movu m2, [rsp+r3*2] 1921*c0909341SAndroid Build Coastguard Worker movzx r3d, byte [rsp+8*6+4] ; base_y2 1922*c0909341SAndroid Build Coastguard Worker movu m4, [rsp+r3*2] 1923*c0909341SAndroid Build Coastguard Worker movzx r3d, byte [rsp+8*6+6] ; base_y3 1924*c0909341SAndroid Build Coastguard Worker movu m0, [rsp+r3*2] 1925*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1926*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m10}, m3, m2, m4, m0 1927*c0909341SAndroid Build Coastguard Worker%else 1928*c0909341SAndroid Build Coastguard Worker mova m1, [rsp+16*25] 1929*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m1}, m3, m2, m4, m0 1930*c0909341SAndroid Build Coastguard Worker%endif 1931*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m2 1932*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m2 ; 01 1933*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 1934*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 ; 23 1935*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1, m2 ; y0 d1 1936*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m2 ; y2 y3 1937*c0909341SAndroid Build Coastguard Worker punpckldq m2, m3, m4 1938*c0909341SAndroid Build Coastguard Worker punpckhdq m3, m4 1939*c0909341SAndroid Build Coastguard Worker movddup m4, 
[rsp+8*7] 1940*c0909341SAndroid Build Coastguard Worker psubw m2, m0 1941*c0909341SAndroid Build Coastguard Worker psubw m3, m1 1942*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 1943*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 1944*c0909341SAndroid Build Coastguard Worker psraw m6, 15 ; base_x < topleft 1945*c0909341SAndroid Build Coastguard Worker psraw m4, m5, 15 1946*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1947*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1948*c0909341SAndroid Build Coastguard Worker pand m0, m6 1949*c0909341SAndroid Build Coastguard Worker pandn m6, [rsp+16*22] 1950*c0909341SAndroid Build Coastguard Worker pand m1, m4 1951*c0909341SAndroid Build Coastguard Worker pandn m4, [rsp+16*23] 1952*c0909341SAndroid Build Coastguard Worker por m0, m6 1953*c0909341SAndroid Build Coastguard Worker por m1, m4 1954*c0909341SAndroid Build Coastguard Worker.w4_toponly: 1955*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1956*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 1957*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 1958*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1959*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m1 1960*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m1 1961*c0909341SAndroid Build Coastguard Worker sub hd, 4 1962*c0909341SAndroid Build Coastguard Worker jz .w4_end 1963*c0909341SAndroid Build Coastguard Worker movq m4, [rsp+8*6] 1964*c0909341SAndroid Build Coastguard Worker paddsw m6, m5, m7 ; xpos += dx 1965*c0909341SAndroid Build Coastguard Worker movq m5, [rsp+8*3] 1966*c0909341SAndroid Build Coastguard Worker psubw m4, m5 1967*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1968*c0909341SAndroid Build Coastguard Worker movq [rsp+8*6], m4 1969*c0909341SAndroid Build Coastguard Worker cmp r2d, r11d 1970*c0909341SAndroid Build Coastguard Worker 
jge .w4_loop 1971*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop: 1972*c0909341SAndroid Build Coastguard Worker movzx r2d, byte [rsp+8*6+0] ; base_y0 1973*c0909341SAndroid Build Coastguard Worker movu m3, [rsp+r2*2] 1974*c0909341SAndroid Build Coastguard Worker movzx r2d, byte [rsp+8*6+2] ; base_y1 1975*c0909341SAndroid Build Coastguard Worker movu m2, [rsp+r2*2] 1976*c0909341SAndroid Build Coastguard Worker movzx r2d, byte [rsp+8*6+4] ; base_y2 1977*c0909341SAndroid Build Coastguard Worker movu m6, [rsp+r2*2] 1978*c0909341SAndroid Build Coastguard Worker movzx r2d, byte [rsp+8*6+6] ; base_y3 1979*c0909341SAndroid Build Coastguard Worker movu m0, [rsp+r2*2] 1980*c0909341SAndroid Build Coastguard Worker psubw m4, m5 1981*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1982*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m10}, m3, m2, m6, m0 1983*c0909341SAndroid Build Coastguard Worker%else 1984*c0909341SAndroid Build Coastguard Worker mova m1, [rsp+16*25] 1985*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m1}, m3, m2, m6, m0 1986*c0909341SAndroid Build Coastguard Worker%endif 1987*c0909341SAndroid Build Coastguard Worker movq [rsp+8*6], m4 1988*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m2 1989*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m2 1990*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m6, m0 1991*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 1992*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1, m2 1993*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m2 1994*c0909341SAndroid Build Coastguard Worker punpckldq m2, m3, m6 1995*c0909341SAndroid Build Coastguard Worker punpckhdq m3, m6 1996*c0909341SAndroid Build Coastguard Worker movddup m6, [rsp+8*7] 1997*c0909341SAndroid Build Coastguard Worker psubw m2, m0 1998*c0909341SAndroid Build Coastguard Worker psubw m3, m1 1999*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 2000*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m3, m6 2001*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2002*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2003*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 2004*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 2005*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2006*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m1 2007*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m1 2008*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2009*c0909341SAndroid Build Coastguard Worker sub hd, 4 2010*c0909341SAndroid Build Coastguard Worker jg .w4_leftonly_loop 2011*c0909341SAndroid Build Coastguard Worker.w4_end: 2012*c0909341SAndroid Build Coastguard Worker sub r9d, 1<<8 2013*c0909341SAndroid Build Coastguard Worker jl .w4_ret 2014*c0909341SAndroid Build Coastguard Worker movq m4, [rsp+8*5] 2015*c0909341SAndroid Build Coastguard Worker add r5, 8 2016*c0909341SAndroid Build Coastguard Worker mov dstq, r5 2017*c0909341SAndroid Build Coastguard Worker paddw m4, [rsp+8*4] ; base_y += 4*dy 2018*c0909341SAndroid Build Coastguard Worker movzx r2d, word [rsp+8*1] 2019*c0909341SAndroid Build Coastguard Worker movddup m6, [rsp+8*1] 2020*c0909341SAndroid Build Coastguard Worker paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above) 2021*c0909341SAndroid Build Coastguard Worker add r2d, r10d 2022*c0909341SAndroid Build Coastguard Worker mov r10d, r2d 2023*c0909341SAndroid Build Coastguard Worker jmp .w4_loop0 2024*c0909341SAndroid Build Coastguard Worker.w4_ret: 2025*c0909341SAndroid Build Coastguard Worker RET 2026*c0909341SAndroid Build Coastguard Worker.w8: 2027*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2028*c0909341SAndroid Build Coastguard Worker jnz .w4_main 2029*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+126] 2030*c0909341SAndroid Build Coastguard Worker pshufhw m1, m5, q3333 
2031*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2032*c0909341SAndroid Build Coastguard Worker mov r3b, hb 2033*c0909341SAndroid Build Coastguard Worker%else 2034*c0909341SAndroid Build Coastguard Worker xor r3b, r3b 2035*c0909341SAndroid Build Coastguard Worker or r3d, hd 2036*c0909341SAndroid Build Coastguard Worker%endif 2037*c0909341SAndroid Build Coastguard Worker movhps [rsp+16*15], m1 2038*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2039*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2040*c0909341SAndroid Build Coastguard Worker call .upsample_above 2041*c0909341SAndroid Build Coastguard Worker sub angled, 53 2042*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2043*c0909341SAndroid Build Coastguard Worker xor angled, 0x7f ; 180 - angle 2044*c0909341SAndroid Build Coastguard Worker movu m1, [base+z_filt_wh8] 2045*c0909341SAndroid Build Coastguard Worker movd m2, r3d 2046*c0909341SAndroid Build Coastguard Worker movd m7, angled 2047*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2048*c0909341SAndroid Build Coastguard Worker psrldq m4, [base+z_filt_t_w48+angleq*8], 4 2049*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 2050*c0909341SAndroid Build Coastguard Worker pshufb m7, m3 2051*c0909341SAndroid Build Coastguard Worker pcmpeqb m2, m1 2052*c0909341SAndroid Build Coastguard Worker movq m1, [base+pw_512] 2053*c0909341SAndroid Build Coastguard Worker pand m7, m2 2054*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m4 2055*c0909341SAndroid Build Coastguard Worker movq [rsp+8*1], m1 ; 8<<6 2056*c0909341SAndroid Build Coastguard Worker jmp .w8_filter_left 2057*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above: 2058*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2059*c0909341SAndroid Build Coastguard Worker mov [rsp+16*4], angled 2060*c0909341SAndroid Build Coastguard Worker sub angled, 90 2061*c0909341SAndroid Build Coastguard 
Worker movd m2, r3d 2062*c0909341SAndroid Build Coastguard Worker mov r3d, 90 2063*c0909341SAndroid Build Coastguard Worker movd m1, angled 2064*c0909341SAndroid Build Coastguard Worker sub r3d, angled ; 180 - angle 2065*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2066*c0909341SAndroid Build Coastguard Worker movu m4, [base+z_filt_wh8] 2067*c0909341SAndroid Build Coastguard Worker movd m7, r3d 2068*c0909341SAndroid Build Coastguard Worker psrldq m5, [base+z_filt_t_w48+angleq*8], 4 2069*c0909341SAndroid Build Coastguard Worker mov r3d, 8 2070*c0909341SAndroid Build Coastguard Worker call .w8_filter_top 2071*c0909341SAndroid Build Coastguard Worker mov r3d, [rsp+16*4] 2072*c0909341SAndroid Build Coastguard Worker sub r3d, 141 2073*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2074*c0909341SAndroid Build Coastguard Worker mov r3b, hb 2075*c0909341SAndroid Build Coastguard Worker%else 2076*c0909341SAndroid Build Coastguard Worker xor r3b, r3b 2077*c0909341SAndroid Build Coastguard Worker or r3d, hd 2078*c0909341SAndroid Build Coastguard Worker%endif 2079*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2080*c0909341SAndroid Build Coastguard Worker jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm 2081*c0909341SAndroid Build Coastguard Worker.w8_filter_left: 2082*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m7 2083*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2084*c0909341SAndroid Build Coastguard Worker jz .w4_main 2085*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 2086*c0909341SAndroid Build Coastguard Worker neg hq 2087*c0909341SAndroid Build Coastguard Worker mov r3, tlq 2088*c0909341SAndroid Build Coastguard Worker movd m1, [tlq+hq*2] 2089*c0909341SAndroid Build Coastguard Worker shr r5d, 30 ; filter_strength 2090*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*13-2] 2091*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 2092*c0909341SAndroid Build Coastguard Worker 
movq [tlq+hq*2-6], m1 2093*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge 2094*c0909341SAndroid Build Coastguard Worker jmp .filter_left_end 2095*c0909341SAndroid Build Coastguard Worker.w8_filter_top: 2096*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m3}, m2, m1, m7 2097*c0909341SAndroid Build Coastguard Worker pcmpeqb m2, m4 2098*c0909341SAndroid Build Coastguard Worker pand m1, m2 2099*c0909341SAndroid Build Coastguard Worker pand m7, m2 2100*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m5 2101*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m5 2102*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 2103*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2104*c0909341SAndroid Build Coastguard Worker jz .w8_filter_top_end ; filter_strength == 0 2105*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 2106*c0909341SAndroid Build Coastguard Worker mov [dstq], tlq 2107*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*14+gprsize] 2108*c0909341SAndroid Build Coastguard Worker shr r5d, 30 ; filter_strength 2109*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge 2110*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2111*c0909341SAndroid Build Coastguard Worker mov r3d, r7m ; maxw, offset due to call 2112*c0909341SAndroid Build Coastguard Worker%else 2113*c0909341SAndroid Build Coastguard Worker mov r3d, [rsp+16*2+4*1] 2114*c0909341SAndroid Build Coastguard Worker%endif 2115*c0909341SAndroid Build Coastguard Worker mov tlq, [dstq] 2116*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2117*c0909341SAndroid Build Coastguard Worker jge .w8_filter_top_end 2118*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*0+2] 2119*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*1+2] 2120*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*14+gprsize], m1 2121*c0909341SAndroid 
Build Coastguard Worker movu [rsp+r3*2+16*15+gprsize], m2 2122*c0909341SAndroid Build Coastguard Worker.w8_filter_top_end: 2123*c0909341SAndroid Build Coastguard Worker ret 2124*c0909341SAndroid Build Coastguard Worker.w16: 2125*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2126*c0909341SAndroid Build Coastguard Worker jnz .w4_main 2127*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+15] 2128*c0909341SAndroid Build Coastguard Worker sub angled, 90 2129*c0909341SAndroid Build Coastguard Worker movd m2, r3d 2130*c0909341SAndroid Build Coastguard Worker mov r3d, 90 2131*c0909341SAndroid Build Coastguard Worker movd m1, angled 2132*c0909341SAndroid Build Coastguard Worker sub r3d, angled ; 180 - angle 2133*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2134*c0909341SAndroid Build Coastguard Worker movd m7, r3d 2135*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m3}, m2, m1, m7 2136*c0909341SAndroid Build Coastguard Worker movq m4, [base+z_filt_t_w16+angleq*4] 2137*c0909341SAndroid Build Coastguard Worker pcmpeqb m2, [base+z_filt_wh16] 2138*c0909341SAndroid Build Coastguard Worker pand m1, m2 2139*c0909341SAndroid Build Coastguard Worker pand m7, m2 2140*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4 2141*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m4 2142*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 2143*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2144*c0909341SAndroid Build Coastguard Worker jz .w16_filter_left ; filter_strength == 0 2145*c0909341SAndroid Build Coastguard Worker imul r5d, 0x24924924 2146*c0909341SAndroid Build Coastguard Worker pshufhw m6, m6, q3333 2147*c0909341SAndroid Build Coastguard Worker mov [dstq], tlq 2148*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*14] 2149*c0909341SAndroid Build Coastguard Worker shr r5d, 30 2150*c0909341SAndroid Build Coastguard Worker movhps [tlq+16*2], m6 2151*c0909341SAndroid Build Coastguard Worker adc r5d, -1 ; 
filter_strength 2152*c0909341SAndroid Build Coastguard Worker mov r3d, 16 2153*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge 2154*c0909341SAndroid Build Coastguard Worker mov r3d, maxwm 2155*c0909341SAndroid Build Coastguard Worker mov tlq, [dstq] 2156*c0909341SAndroid Build Coastguard Worker cmp r3d, 16 2157*c0909341SAndroid Build Coastguard Worker jge .w16_filter_left 2158*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*0+2] 2159*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*1+2] 2160*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*14], m1 2161*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*15], m2 2162*c0909341SAndroid Build Coastguard Worker.w16_filter_left: 2163*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m7 2164*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2165*c0909341SAndroid Build Coastguard Worker jz .w4_main 2166*c0909341SAndroid Build Coastguard Worker imul r5d, 0x24924924 2167*c0909341SAndroid Build Coastguard Worker neg hq 2168*c0909341SAndroid Build Coastguard Worker mov r3, tlq 2169*c0909341SAndroid Build Coastguard Worker movd m1, [tlq+hq*2] 2170*c0909341SAndroid Build Coastguard Worker shr r5d, 30 2171*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*13-2] 2172*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 2173*c0909341SAndroid Build Coastguard Worker adc r5d, -1 ; filter_strength 2174*c0909341SAndroid Build Coastguard Worker movq [tlq+hq*2-6], m1 2175*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge 2176*c0909341SAndroid Build Coastguard Worker jmp .filter_left_end 2177*c0909341SAndroid Build Coastguard Worker.w32: 2178*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*2+2] 2179*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+16*3+2] 2180*c0909341SAndroid Build Coastguard Worker mova [rsp+16*16], m1 
2181*c0909341SAndroid Build Coastguard Worker mova [rsp+16*17], m2 2182*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2183*c0909341SAndroid Build Coastguard Worker jnz .w4_main 2184*c0909341SAndroid Build Coastguard Worker mov [dstq], tlq 2185*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*14] 2186*c0909341SAndroid Build Coastguard Worker pshufhw m2, m2, q3333 2187*c0909341SAndroid Build Coastguard Worker mov r3d, 32 2188*c0909341SAndroid Build Coastguard Worker movhps [tlq+16*4], m2 2189*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 2190*c0909341SAndroid Build Coastguard Worker mov r3d, maxwm 2191*c0909341SAndroid Build Coastguard Worker mov tlq, [dstq] 2192*c0909341SAndroid Build Coastguard Worker cmp r3d, 32 2193*c0909341SAndroid Build Coastguard Worker jge .filter_left 2194*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*0+2] 2195*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*1+2] 2196*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*14], m1 2197*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*15], m2 2198*c0909341SAndroid Build Coastguard Worker cmp r3d, 16 2199*c0909341SAndroid Build Coastguard Worker jge .filter_left 2200*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*2+2] 2201*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*3+2] 2202*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*16], m1 2203*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*17], m2 2204*c0909341SAndroid Build Coastguard Worker.filter_left: 2205*c0909341SAndroid Build Coastguard Worker neg hq 2206*c0909341SAndroid Build Coastguard Worker mov r3, tlq 2207*c0909341SAndroid Build Coastguard Worker pshuflw m1, [tlq+hq*2], q0000 2208*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*13-2] 2209*c0909341SAndroid Build Coastguard Worker movq [tlq+hq*2-6], m1 2210*c0909341SAndroid Build Coastguard 
Worker call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 2211*c0909341SAndroid Build Coastguard Worker.filter_left_end: 2212*c0909341SAndroid Build Coastguard Worker mov r2d, maxhm 2213*c0909341SAndroid Build Coastguard Worker cmp r2d, hd 2214*c0909341SAndroid Build Coastguard Worker jge .w4_main 2215*c0909341SAndroid Build Coastguard Worker neg r2 2216*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2*2-16*1] 2217*c0909341SAndroid Build Coastguard Worker movu m2, [r3+r2*2-16*2] 2218*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16*12], m1 2219*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16*11], m2 2220*c0909341SAndroid Build Coastguard Worker cmp r2d, -48 2221*c0909341SAndroid Build Coastguard Worker jle .w4_main 2222*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2*2-16*3] 2223*c0909341SAndroid Build Coastguard Worker movu m2, [r3+r2*2-16*4] 2224*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16*10], m1 2225*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16* 9], m2 2226*c0909341SAndroid Build Coastguard Worker cmp r2d, -32 2227*c0909341SAndroid Build Coastguard Worker jle .w4_main 2228*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2*2-16*5] 2229*c0909341SAndroid Build Coastguard Worker movu m2, [r3+r2*2-16*6] 2230*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16* 8], m1 2231*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16* 7], m2 2232*c0909341SAndroid Build Coastguard Worker cmp r2d, -16 2233*c0909341SAndroid Build Coastguard Worker jle .w4_main 2234*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2*2-16*7] 2235*c0909341SAndroid Build Coastguard Worker movu m2, [r3+r2*2-16*8] 2236*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16* 6], m1 2237*c0909341SAndroid Build Coastguard Worker movu [rsp+r2*2+16* 5], m2 2238*c0909341SAndroid Build Coastguard Worker jmp .w4_main 2239*c0909341SAndroid Build Coastguard Worker.w64: 2240*c0909341SAndroid Build 
Coastguard Worker movu m1, [tlq+16*2+2] 2241*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+16*3+2] 2242*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+16*4+2] 2243*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+16*5+2] 2244*c0909341SAndroid Build Coastguard Worker movu m5, [tlq+16*6+2] 2245*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+16*7+2] 2246*c0909341SAndroid Build Coastguard Worker mov [dstq], tlq 2247*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*14] 2248*c0909341SAndroid Build Coastguard Worker mova [tlq+16*2], m1 2249*c0909341SAndroid Build Coastguard Worker mova [tlq+16*3], m2 2250*c0909341SAndroid Build Coastguard Worker mova [tlq+16*4], m3 2251*c0909341SAndroid Build Coastguard Worker mova [tlq+16*5], m4 2252*c0909341SAndroid Build Coastguard Worker mova [tlq+16*6], m5 2253*c0909341SAndroid Build Coastguard Worker mova [tlq+16*7], m6 2254*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2255*c0909341SAndroid Build Coastguard Worker jnz .w4_main 2256*c0909341SAndroid Build Coastguard Worker pshufhw m6, m6, q3333 2257*c0909341SAndroid Build Coastguard Worker mov r3d, 64 2258*c0909341SAndroid Build Coastguard Worker movhps [tlq+16*8], m6 2259*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 2260*c0909341SAndroid Build Coastguard Worker mov r3d, maxwm 2261*c0909341SAndroid Build Coastguard Worker mov tlq, [dstq] 2262*c0909341SAndroid Build Coastguard Worker cmp r3d, 64 2263*c0909341SAndroid Build Coastguard Worker jge .filter_left 2264*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*0+2] 2265*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*1+2] 2266*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*14], m1 2267*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*15], m2 2268*c0909341SAndroid Build Coastguard Worker cmp r3d, 48 2269*c0909341SAndroid Build Coastguard Worker jge .filter_left 
2270*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*2+2] 2271*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*3+2] 2272*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*16], m1 2273*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*17], m2 2274*c0909341SAndroid Build Coastguard Worker cmp r3d, 32 2275*c0909341SAndroid Build Coastguard Worker jge .filter_left 2276*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*4+2] 2277*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*5+2] 2278*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*18], m1 2279*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*19], m2 2280*c0909341SAndroid Build Coastguard Worker cmp r3d, 16 2281*c0909341SAndroid Build Coastguard Worker jge .filter_left 2282*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3*2+16*6+2] 2283*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3*2+16*7+2] 2284*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*20], m1 2285*c0909341SAndroid Build Coastguard Worker movu [rsp+r3*2+16*21], m2 2286*c0909341SAndroid Build Coastguard Worker jmp .filter_left 2287*c0909341SAndroid Build Coastguard Worker 2288*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2289*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w 2290*c0909341SAndroid Build Coastguard Worker %define base r7-$$ 2291*c0909341SAndroid Build Coastguard Worker lea r7, [$$] 2292*c0909341SAndroid Build Coastguard Worker mov org_wd, wd 2293*c0909341SAndroid Build Coastguard Worker%else 2294*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy 2295*c0909341SAndroid Build Coastguard Worker %define base r1-$$ 2296*c0909341SAndroid Build Coastguard Worker %define org_wd r5 2297*c0909341SAndroid Build Coastguard Worker %define org_wq r5 2298*c0909341SAndroid Build Coastguard 
Worker movd m6, r8m ; pixel_max 2299*c0909341SAndroid Build Coastguard Worker mov [dstq+4*0], strideq 2300*c0909341SAndroid Build Coastguard Worker LEA r1, $$ 2301*c0909341SAndroid Build Coastguard Worker mov [dstq+4*1], wd 2302*c0909341SAndroid Build Coastguard Worker%endif 2303*c0909341SAndroid Build Coastguard Worker tzcnt hd, hm 2304*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 2305*c0909341SAndroid Build Coastguard Worker sub tlq, 2 2306*c0909341SAndroid Build Coastguard Worker movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] 2307*c0909341SAndroid Build Coastguard Worker sub angled, 180 2308*c0909341SAndroid Build Coastguard Worker movddup m0, [base+pw_256] 2309*c0909341SAndroid Build Coastguard Worker mov dyd, angled 2310*c0909341SAndroid Build Coastguard Worker neg dyd 2311*c0909341SAndroid Build Coastguard Worker xor angled, 0x400 2312*c0909341SAndroid Build Coastguard Worker movddup m7, [base+pw_62] 2313*c0909341SAndroid Build Coastguard Worker or dyq, ~0x7e 2314*c0909341SAndroid Build Coastguard Worker lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] 2315*c0909341SAndroid Build Coastguard Worker movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] 2316*c0909341SAndroid Build Coastguard Worker jmp hq 2317*c0909341SAndroid Build Coastguard Worker.h4: 2318*c0909341SAndroid Build Coastguard Worker lea r4d, [angleq+88] 2319*c0909341SAndroid Build Coastguard Worker test r4d, 0x480 2320*c0909341SAndroid Build Coastguard Worker jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 2321*c0909341SAndroid Build Coastguard Worker sar r4d, 9 2322*c0909341SAndroid Build Coastguard Worker add r4d, wd 2323*c0909341SAndroid Build Coastguard Worker cmp r4d, 8 2324*c0909341SAndroid Build Coastguard Worker jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) 2325*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 2326*c0909341SAndroid Build Coastguard Worker movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 2327*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_64 2328*c0909341SAndroid Build Coastguard Worker movd m6, r8m 2329*c0909341SAndroid Build Coastguard Worker%endif 2330*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m0 2331*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 2332*c0909341SAndroid Build Coastguard Worker palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 2333*c0909341SAndroid Build Coastguard Worker add dyd, dyd 2334*c0909341SAndroid Build Coastguard Worker palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 2335*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2336*c0909341SAndroid Build Coastguard Worker paddw m3, m5 2337*c0909341SAndroid Build Coastguard Worker psubw m5, m1, m3 2338*c0909341SAndroid Build Coastguard Worker mova m3, [base+z_upsample] 2339*c0909341SAndroid Build Coastguard Worker mova [tlq+ 0], m4 2340*c0909341SAndroid Build Coastguard Worker movd m4, dyd 2341*c0909341SAndroid Build Coastguard Worker psraw m5, 3 2342*c0909341SAndroid Build Coastguard Worker neg dyd 2343*c0909341SAndroid Build Coastguard Worker paddw m1, m5 2344*c0909341SAndroid Build Coastguard Worker pxor m5, m5 2345*c0909341SAndroid Build Coastguard Worker lea r5d, [dyq+(16<<6)+63] ; ypos 2346*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m5 2347*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2348*c0909341SAndroid Build Coastguard Worker shl wd, 3 2349*c0909341SAndroid Build Coastguard Worker pavgw m1, m5 2350*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 2351*c0909341SAndroid Build Coastguard Worker pminsw m1, m6 2352*c0909341SAndroid Build Coastguard Worker sub rsp, wq 2353*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1, m2 2354*c0909341SAndroid Build Coastguard Worker paddw m5, m4, m4 2355*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 2356*c0909341SAndroid Build Coastguard Worker mova [tlq+32], m0 2357*c0909341SAndroid Build Coastguard Worker movsd m4, m5 2358*c0909341SAndroid Build Coastguard Worker mova [tlq+16], m1 2359*c0909341SAndroid 
Build Coastguard Worker.h4_upsample_loop: 2360*c0909341SAndroid Build Coastguard Worker lea r4d, [r5+dyq] 2361*c0909341SAndroid Build Coastguard Worker sar r5d, 6 2362*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r5*2] 2363*c0909341SAndroid Build Coastguard Worker lea r5d, [r4+dyq] 2364*c0909341SAndroid Build Coastguard Worker sar r4d, 6 2365*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2] 2366*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 2367*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 2368*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m1, m2 2369*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m2 2370*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2371*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2372*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2373*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2374*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2375*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2376*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m0 2377*c0909341SAndroid Build Coastguard Worker sub wd, 16 2378*c0909341SAndroid Build Coastguard Worker jg .h4_upsample_loop 2379*c0909341SAndroid Build Coastguard Worker or r3d, 4*2 2380*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2381*c0909341SAndroid Build Coastguard Worker.h4_no_upsample: 2382*c0909341SAndroid Build Coastguard Worker mov r4d, 7 2383*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 2384*c0909341SAndroid Build Coastguard Worker jnz .h4_main 2385*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+3] 2386*c0909341SAndroid Build Coastguard Worker movd m1, r4d 2387*c0909341SAndroid Build Coastguard Worker movd m3, angled 2388*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2389*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2390*c0909341SAndroid Build Coastguard Worker pshufb m1, m2 
2391*c0909341SAndroid Build Coastguard Worker pshufb m3, m2 2392*c0909341SAndroid Build Coastguard Worker pcmpeqb m1, [base+z_filt_wh4] 2393*c0909341SAndroid Build Coastguard Worker pand m1, m3 2394*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, [base+z_filt_t_w48+angleq*8] 2395*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 2396*c0909341SAndroid Build Coastguard Worker mov r4d, 7 2397*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2398*c0909341SAndroid Build Coastguard Worker jz .h4_main ; filter_strength == 0 2399*c0909341SAndroid Build Coastguard Worker pshuflw m1, [tlq+2], q0000 2400*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 2401*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-14] 2402*c0909341SAndroid Build Coastguard Worker neg r4 2403*c0909341SAndroid Build Coastguard Worker movd m3, [tlq+r4*2] 2404*c0909341SAndroid Build Coastguard Worker shr r5d, 30 2405*c0909341SAndroid Build Coastguard Worker movd [rsp+16*17], m1 2406*c0909341SAndroid Build Coastguard Worker pshuflw m3, m3, q0000 2407*c0909341SAndroid Build Coastguard Worker mova [rsp+16*16], m2 2408*c0909341SAndroid Build Coastguard Worker lea r2, [r4-2] 2409*c0909341SAndroid Build Coastguard Worker movq [rsp+16*17+r4*2-10], m3 2410*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2411*c0909341SAndroid Build Coastguard Worker cmovae r4, r2 2412*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*17-2] 2413*c0909341SAndroid Build Coastguard Worker call .filter_edge 2414*c0909341SAndroid Build Coastguard Worker.h4_main: 2415*c0909341SAndroid Build Coastguard Worker movd m4, dyd 2416*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2417*c0909341SAndroid Build Coastguard Worker movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 2418*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2419*c0909341SAndroid Build Coastguard Worker shl r4d, 6 2420*c0909341SAndroid Build Coastguard Worker movd m6, [tlq] 2421*c0909341SAndroid Build Coastguard 
Worker movd m3, r4d 2422*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 2423*c0909341SAndroid Build Coastguard Worker neg dyq 2424*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2425*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] ; ypos 2426*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 2427*c0909341SAndroid Build Coastguard Worker shl wd, 3 2428*c0909341SAndroid Build Coastguard Worker paddw m5, m4, m4 2429*c0909341SAndroid Build Coastguard Worker sub rsp, wq 2430*c0909341SAndroid Build Coastguard Worker psubw m3, m1 ; max_base_y 2431*c0909341SAndroid Build Coastguard Worker movsd m4, m5 ; ypos1 ypos0 2432*c0909341SAndroid Build Coastguard Worker.h4_loop: 2433*c0909341SAndroid Build Coastguard Worker lea r4, [r5+dyq] 2434*c0909341SAndroid Build Coastguard Worker sar r5, 6 2435*c0909341SAndroid Build Coastguard Worker movddup m0, [tlq+r5*2-6] 2436*c0909341SAndroid Build Coastguard Worker movddup m1, [tlq+r5*2-8] 2437*c0909341SAndroid Build Coastguard Worker lea r5, [r4+dyq] 2438*c0909341SAndroid Build Coastguard Worker sar r4, 6 2439*c0909341SAndroid Build Coastguard Worker movlps m0, [tlq+r4*2-6] 2440*c0909341SAndroid Build Coastguard Worker movlps m1, [tlq+r4*2-8] 2441*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2442*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2443*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2444*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2445*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m3, m4 2446*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2447*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2448*c0909341SAndroid Build Coastguard Worker pand m0, m2 2449*c0909341SAndroid Build Coastguard Worker pandn m2, m6 2450*c0909341SAndroid Build Coastguard Worker por m0, m2 2451*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m0 2452*c0909341SAndroid Build Coastguard Worker sub wd, 16 2453*c0909341SAndroid Build Coastguard Worker jz 
.h4_transpose 2454*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2455*c0909341SAndroid Build Coastguard Worker jg .h4_loop 2456*c0909341SAndroid Build Coastguard Worker.h4_end_loop: 2457*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m6 2458*c0909341SAndroid Build Coastguard Worker sub wd, 16 2459*c0909341SAndroid Build Coastguard Worker jg .h4_end_loop 2460*c0909341SAndroid Build Coastguard Worker.h4_transpose: 2461*c0909341SAndroid Build Coastguard Worker or r3d, 4*2 2462*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2463*c0909341SAndroid Build Coastguard Worker.h8: 2464*c0909341SAndroid Build Coastguard Worker lea r4d, [angleq+88] 2465*c0909341SAndroid Build Coastguard Worker and r4d, ~0x7f 2466*c0909341SAndroid Build Coastguard Worker or r4d, wd 2467*c0909341SAndroid Build Coastguard Worker cmp r4d, 8 2468*c0909341SAndroid Build Coastguard Worker ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 2469*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-30] ; g f e d c b a 9 2470*c0909341SAndroid Build Coastguard Worker movu m1, [tlq-32] ; _ g f e d c b a 2471*c0909341SAndroid Build Coastguard Worker movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 2472*c0909341SAndroid Build Coastguard Worker paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 2473*c0909341SAndroid Build Coastguard Worker pshufd m4, m2, q2100 ; _ _ g f e d c b 2474*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2475*c0909341SAndroid Build Coastguard Worker movu m5, [tlq-28] ; f e d c b a 9 8 2476*c0909341SAndroid Build Coastguard Worker add dyd, dyd 2477*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2478*c0909341SAndroid Build Coastguard Worker je .h8_upsample_w8 2479*c0909341SAndroid Build Coastguard Worker pshufhw m4, m2, q1000 ; _ _ _ _ c c c b 2480*c0909341SAndroid Build Coastguard Worker.h8_upsample_w8: 2481*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2482*c0909341SAndroid Build Coastguard Worker psubw m5, m1, m4 
2483*c0909341SAndroid Build Coastguard Worker movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 2484*c0909341SAndroid Build Coastguard Worker psraw m5, 3 2485*c0909341SAndroid Build Coastguard Worker paddw m1, m5 2486*c0909341SAndroid Build Coastguard Worker movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 2487*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2488*c0909341SAndroid Build Coastguard Worker movd m6, r8m ; pixel_max 2489*c0909341SAndroid Build Coastguard Worker%endif 2490*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2491*c0909341SAndroid Build Coastguard Worker shl wd, 4 2492*c0909341SAndroid Build Coastguard Worker psubw m5, m3, m4 2493*c0909341SAndroid Build Coastguard Worker movd m4, dyd 2494*c0909341SAndroid Build Coastguard Worker psraw m5, 3 2495*c0909341SAndroid Build Coastguard Worker neg dyd 2496*c0909341SAndroid Build Coastguard Worker paddw m3, m5 2497*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2498*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-14] 2499*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 2500*c0909341SAndroid Build Coastguard Worker pxor m0, m0 2501*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m0 2502*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m0 2503*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 2504*c0909341SAndroid Build Coastguard Worker pavgw m1, m0 2505*c0909341SAndroid Build Coastguard Worker pavgw m3, m0 2506*c0909341SAndroid Build Coastguard Worker sub rsp, wq 2507*c0909341SAndroid Build Coastguard Worker pminsw m1, m6 2508*c0909341SAndroid Build Coastguard Worker pminsw m6, m3 2509*c0909341SAndroid Build Coastguard Worker mova m3, [base+z_upsample] 2510*c0909341SAndroid Build Coastguard Worker lea r5d, [dyq+(16<<6)+63] ; ypos 2511*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 2512*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m0 2513*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 2514*c0909341SAndroid Build Coastguard Worker mova 
[tlq+16*1], m1 2515*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m6, m5 2516*c0909341SAndroid Build Coastguard Worker mova [tlq+16*2], m0 2517*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m5 2518*c0909341SAndroid Build Coastguard Worker mova [tlq+16*3], m6 2519*c0909341SAndroid Build Coastguard Worker mova m5, m4 2520*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop: 2521*c0909341SAndroid Build Coastguard Worker mov r4d, r5d 2522*c0909341SAndroid Build Coastguard Worker sar r4d, 6 2523*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2+16*0] 2524*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r4*2+16*1] 2525*c0909341SAndroid Build Coastguard Worker add r5d, dyd 2526*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 2527*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 2528*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m1, m2 2529*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m2 2530*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2531*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2532*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2533*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2534*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2535*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2536*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m0 2537*c0909341SAndroid Build Coastguard Worker sub wd, 16 2538*c0909341SAndroid Build Coastguard Worker jg .h8_upsample_loop 2539*c0909341SAndroid Build Coastguard Worker or r3d, 8*2 2540*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2541*c0909341SAndroid Build Coastguard Worker.h8_no_upsample: 2542*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+7] 2543*c0909341SAndroid Build Coastguard Worker movd m1, r4d 2544*c0909341SAndroid Build Coastguard Worker and r4d, 7 2545*c0909341SAndroid Build Coastguard Worker or r4d, 8 ; imin(w+7, 15) 2546*c0909341SAndroid Build Coastguard Worker 
test angled, 0x400 2547*c0909341SAndroid Build Coastguard Worker jnz .h8_main 2548*c0909341SAndroid Build Coastguard Worker movd m3, angled 2549*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2550*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2551*c0909341SAndroid Build Coastguard Worker pshufb m1, m2 2552*c0909341SAndroid Build Coastguard Worker pshufb m3, m2 2553*c0909341SAndroid Build Coastguard Worker movu m2, [base+z_filt_wh8] 2554*c0909341SAndroid Build Coastguard Worker psrldq m4, [base+z_filt_t_w48+angleq*8], 4 2555*c0909341SAndroid Build Coastguard Worker pcmpeqb m2, m1 2556*c0909341SAndroid Build Coastguard Worker pand m2, m3 2557*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m4 2558*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m2 2559*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2560*c0909341SAndroid Build Coastguard Worker jz .h8_main ; filter_strength == 0 2561*c0909341SAndroid Build Coastguard Worker pshuflw m1, [tlq+2], q0000 2562*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 2563*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16*1+2] 2564*c0909341SAndroid Build Coastguard Worker neg r4 2565*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*2+2] 2566*c0909341SAndroid Build Coastguard Worker shr r5d, 30 2567*c0909341SAndroid Build Coastguard Worker movd m4, [tlq+r4*2] 2568*c0909341SAndroid Build Coastguard Worker movd [rsp+16*17], m1 2569*c0909341SAndroid Build Coastguard Worker mova [rsp+16*16], m2 2570*c0909341SAndroid Build Coastguard Worker pshuflw m4, m4, q0000 2571*c0909341SAndroid Build Coastguard Worker mova [rsp+16*15], m3 2572*c0909341SAndroid Build Coastguard Worker lea r2, [r4-2] 2573*c0909341SAndroid Build Coastguard Worker movq [rsp+16*17+r4*2-10], m4 2574*c0909341SAndroid Build Coastguard Worker cmp wd, 16 2575*c0909341SAndroid Build Coastguard Worker cmovae r4, r2 2576*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*17-2] 
2577*c0909341SAndroid Build Coastguard Worker call .filter_edge 2578*c0909341SAndroid Build Coastguard Worker.h8_main: 2579*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2580*c0909341SAndroid Build Coastguard Worker movd m4, dyd 2581*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2582*c0909341SAndroid Build Coastguard Worker shl r4d, 6 2583*c0909341SAndroid Build Coastguard Worker movd m6, [tlq] 2584*c0909341SAndroid Build Coastguard Worker movd m3, r4d 2585*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 2586*c0909341SAndroid Build Coastguard Worker neg dyq 2587*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2588*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] 2589*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 2590*c0909341SAndroid Build Coastguard Worker shl wd, 4 2591*c0909341SAndroid Build Coastguard Worker mova m5, m4 2592*c0909341SAndroid Build Coastguard Worker sub rsp, wq 2593*c0909341SAndroid Build Coastguard Worker psubw m3, [base+z_base_inc_z2] 2594*c0909341SAndroid Build Coastguard Worker.h8_loop: 2595*c0909341SAndroid Build Coastguard Worker mov r4, r5 2596*c0909341SAndroid Build Coastguard Worker sar r4, 6 2597*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2-14] 2598*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2-16] 2599*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2600*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2601*c0909341SAndroid Build Coastguard Worker psubw m1, m0 2602*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 2603*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m3, m4 2604*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2605*c0909341SAndroid Build Coastguard Worker paddw m0, m1 2606*c0909341SAndroid Build Coastguard Worker pand m0, m2 2607*c0909341SAndroid Build Coastguard Worker pandn m2, m6 2608*c0909341SAndroid Build Coastguard Worker por m0, m2 2609*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m0 
2610*c0909341SAndroid Build Coastguard Worker sub wd, 8*2 2611*c0909341SAndroid Build Coastguard Worker jz .h8_transpose 2612*c0909341SAndroid Build Coastguard Worker add r5, dyq 2613*c0909341SAndroid Build Coastguard Worker jg .h8_loop 2614*c0909341SAndroid Build Coastguard Worker.h8_end_loop: 2615*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16], m6 2616*c0909341SAndroid Build Coastguard Worker sub wd, 8*2 2617*c0909341SAndroid Build Coastguard Worker jg .h8_end_loop 2618*c0909341SAndroid Build Coastguard Worker.h8_transpose: 2619*c0909341SAndroid Build Coastguard Worker or r3d, 8*2 2620*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2621*c0909341SAndroid Build Coastguard Worker.h16: 2622*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+15] 2623*c0909341SAndroid Build Coastguard Worker movd m1, r4d 2624*c0909341SAndroid Build Coastguard Worker and r4d, 15 2625*c0909341SAndroid Build Coastguard Worker or r4d, 16 ; imin(w+15, 31) 2626*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2627*c0909341SAndroid Build Coastguard Worker jnz .h16_main 2628*c0909341SAndroid Build Coastguard Worker movd m3, angled 2629*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 2630*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2631*c0909341SAndroid Build Coastguard Worker pshufb m1, m2 2632*c0909341SAndroid Build Coastguard Worker pshufb m3, m2 2633*c0909341SAndroid Build Coastguard Worker movq m4, [base+z_filt_t_w16+angleq*4] 2634*c0909341SAndroid Build Coastguard Worker pcmpeqb m1, [base+z_filt_wh16] 2635*c0909341SAndroid Build Coastguard Worker pand m1, m3 2636*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4 2637*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 2638*c0909341SAndroid Build Coastguard Worker test r5d, r5d 2639*c0909341SAndroid Build Coastguard Worker jz .h16_main ; filter_strength == 0 2640*c0909341SAndroid Build Coastguard Worker pshuflw m1, [tlq+2], q0000 2641*c0909341SAndroid Build 
Coastguard Worker mova m2, [tlq-16*1+2] 2642*c0909341SAndroid Build Coastguard Worker imul r5d, 0x24924924 2643*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*2+2] 2644*c0909341SAndroid Build Coastguard Worker neg r4 2645*c0909341SAndroid Build Coastguard Worker mova m4, [tlq-16*3+2] 2646*c0909341SAndroid Build Coastguard Worker shr r5d, 30 2647*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-16*4+2] 2648*c0909341SAndroid Build Coastguard Worker movd m6, [tlq+r4*2] 2649*c0909341SAndroid Build Coastguard Worker adc r5d, -1 ; filter_strength 2650*c0909341SAndroid Build Coastguard Worker movd [rsp+16*17], m1 2651*c0909341SAndroid Build Coastguard Worker mova [rsp+16*16], m2 2652*c0909341SAndroid Build Coastguard Worker mova [rsp+16*15], m3 2653*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q0000 2654*c0909341SAndroid Build Coastguard Worker mova [rsp+16*14], m4 2655*c0909341SAndroid Build Coastguard Worker mova [rsp+16*13], m5 2656*c0909341SAndroid Build Coastguard Worker lea r2, [r4-2] 2657*c0909341SAndroid Build Coastguard Worker movq [rsp+16*17+r4*2-10], m6 2658*c0909341SAndroid Build Coastguard Worker cmp wd, 32 2659*c0909341SAndroid Build Coastguard Worker cmovae r4, r2 2660*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*17-2] 2661*c0909341SAndroid Build Coastguard Worker call .filter_edge 2662*c0909341SAndroid Build Coastguard Worker.h16_main: 2663*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2664*c0909341SAndroid Build Coastguard Worker movd m5, dyd 2665*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2666*c0909341SAndroid Build Coastguard Worker shl r4d, 6 2667*c0909341SAndroid Build Coastguard Worker movd m6, [tlq] 2668*c0909341SAndroid Build Coastguard Worker movd m3, r4d 2669*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 2670*c0909341SAndroid Build Coastguard Worker neg dyq 2671*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2672*c0909341SAndroid Build Coastguard Worker lea r5, 
[dyq+r4+63] 2673*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 2674*c0909341SAndroid Build Coastguard Worker shl wd, 5 2675*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [base+z_base_inc_z2] 2676*c0909341SAndroid Build Coastguard Worker sub rsp, wq 2677*c0909341SAndroid Build Coastguard Worker psubw m4, m3 2678*c0909341SAndroid Build Coastguard Worker.h16_loop: 2679*c0909341SAndroid Build Coastguard Worker mov r4, r5 2680*c0909341SAndroid Build Coastguard Worker sar r4, 6 2681*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2-14] 2682*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r4*2-16] 2683*c0909341SAndroid Build Coastguard Worker pand m3, m7, m4 2684*c0909341SAndroid Build Coastguard Worker psllw m3, 9 2685*c0909341SAndroid Build Coastguard Worker psubw m2, m0 2686*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 2687*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2-30] 2688*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2689*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r4*2-32] 2690*c0909341SAndroid Build Coastguard Worker psubw m2, m1 2691*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 2692*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m512] 2693*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2694*c0909341SAndroid Build Coastguard Worker psraw m2, m4, 15 2695*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2696*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2697*c0909341SAndroid Build Coastguard Worker pand m0, m2 2698*c0909341SAndroid Build Coastguard Worker pandn m2, m6 2699*c0909341SAndroid Build Coastguard Worker pand m1, m3 2700*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2701*c0909341SAndroid Build Coastguard Worker por m0, m2 2702*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16*1], m0 2703*c0909341SAndroid Build Coastguard Worker por m1, m3 2704*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16*2], m1 
2705*c0909341SAndroid Build Coastguard Worker sub wd, 16*2 2706*c0909341SAndroid Build Coastguard Worker jz .h16_transpose 2707*c0909341SAndroid Build Coastguard Worker add r5, dyq 2708*c0909341SAndroid Build Coastguard Worker jg .h16_loop 2709*c0909341SAndroid Build Coastguard Worker.h16_end_loop: 2710*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16*1], m6 2711*c0909341SAndroid Build Coastguard Worker mova [rsp+wq-16*2], m6 2712*c0909341SAndroid Build Coastguard Worker sub wd, 16*2 2713*c0909341SAndroid Build Coastguard Worker jg .h16_end_loop 2714*c0909341SAndroid Build Coastguard Worker.h16_transpose: 2715*c0909341SAndroid Build Coastguard Worker or r3d, 16*2 2716*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2717*c0909341SAndroid Build Coastguard Worker.h32: 2718*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+31] 2719*c0909341SAndroid Build Coastguard Worker and r4d, 31 2720*c0909341SAndroid Build Coastguard Worker or r4d, 32 ; imin(w+31, 63) 2721*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 2722*c0909341SAndroid Build Coastguard Worker jnz .h32_main 2723*c0909341SAndroid Build Coastguard Worker call .filter_copy 2724*c0909341SAndroid Build Coastguard Worker lea r5, [r4-2] 2725*c0909341SAndroid Build Coastguard Worker cmp wd, 64 2726*c0909341SAndroid Build Coastguard Worker cmove r4, r5 2727*c0909341SAndroid Build Coastguard Worker call .filter_edge_s3 2728*c0909341SAndroid Build Coastguard Worker.h32_main: 2729*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2730*c0909341SAndroid Build Coastguard Worker movd m5, dyd 2731*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2732*c0909341SAndroid Build Coastguard Worker shl r4d, 6 2733*c0909341SAndroid Build Coastguard Worker movd m6, [tlq] 2734*c0909341SAndroid Build Coastguard Worker movd m3, r4d 2735*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 2736*c0909341SAndroid Build Coastguard Worker neg dyq 2737*c0909341SAndroid 
Build Coastguard Worker pshufb m6, m0 2738*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] 2739*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 2740*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [base+z_base_inc_z2] 2741*c0909341SAndroid Build Coastguard Worker psubw m4, m3 2742*c0909341SAndroid Build Coastguard Worker.h32_loop: 2743*c0909341SAndroid Build Coastguard Worker mov r4, r5 2744*c0909341SAndroid Build Coastguard Worker sar r4, 6 2745*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2-14] 2746*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-16] 2747*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2748*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2749*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2750*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2751*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2-30] 2752*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2753*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-32] 2754*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2755*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2756*c0909341SAndroid Build Coastguard Worker sub rsp, 16*4 2757*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2758*c0909341SAndroid Build Coastguard Worker psraw m3, m4, 15 2759*c0909341SAndroid Build Coastguard Worker pand m0, m3 2760*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2761*c0909341SAndroid Build Coastguard Worker por m0, m3 2762*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m512] 2763*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2764*c0909341SAndroid Build Coastguard Worker pand m1, m3 2765*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2766*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m0 2767*c0909341SAndroid Build Coastguard Worker por m1, m3 2768*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m1 2769*c0909341SAndroid Build 
Coastguard Worker movu m0, [tlq+r4*2-46] 2770*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-48] 2771*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2772*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2773*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2-62] 2774*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2775*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-64] 2776*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2777*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2778*c0909341SAndroid Build Coastguard Worker movddup m2, [base+pw_m1024] 2779*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2780*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m1536] 2781*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m4 2782*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2783*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2784*c0909341SAndroid Build Coastguard Worker pand m0, m2 2785*c0909341SAndroid Build Coastguard Worker pandn m2, m6 2786*c0909341SAndroid Build Coastguard Worker pand m1, m3 2787*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2788*c0909341SAndroid Build Coastguard Worker por m0, m2 2789*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 2790*c0909341SAndroid Build Coastguard Worker por m1, m3 2791*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m1 2792*c0909341SAndroid Build Coastguard Worker dec wd 2793*c0909341SAndroid Build Coastguard Worker jz .h32_transpose 2794*c0909341SAndroid Build Coastguard Worker add r5, dyq 2795*c0909341SAndroid Build Coastguard Worker jg .h32_loop 2796*c0909341SAndroid Build Coastguard Worker.h32_end_loop: 2797*c0909341SAndroid Build Coastguard Worker sub rsp, 16*4 2798*c0909341SAndroid Build Coastguard Worker REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 2799*c0909341SAndroid Build Coastguard Worker dec wd 2800*c0909341SAndroid Build Coastguard Worker jg .h32_end_loop 2801*c0909341SAndroid Build 
Coastguard Worker.h32_transpose: 2802*c0909341SAndroid Build Coastguard Worker or r3d, 32*2 2803*c0909341SAndroid Build Coastguard Worker jmp .end_transpose 2804*c0909341SAndroid Build Coastguard Worker.h64: 2805*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+63] 2806*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 2807*c0909341SAndroid Build Coastguard Worker jnz .h64_main 2808*c0909341SAndroid Build Coastguard Worker call .filter_copy 2809*c0909341SAndroid Build Coastguard Worker call .filter_edge_s3 2810*c0909341SAndroid Build Coastguard Worker.h64_main: 2811*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2812*c0909341SAndroid Build Coastguard Worker movd m5, dyd 2813*c0909341SAndroid Build Coastguard Worker sub tlq, r4 2814*c0909341SAndroid Build Coastguard Worker shl r4d, 6 2815*c0909341SAndroid Build Coastguard Worker movd m6, [tlq] 2816*c0909341SAndroid Build Coastguard Worker movd m3, r4d 2817*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 2818*c0909341SAndroid Build Coastguard Worker neg dyq 2819*c0909341SAndroid Build Coastguard Worker pshufb m6, m0 2820*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] 2821*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 2822*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [base+z_base_inc_z2] 2823*c0909341SAndroid Build Coastguard Worker psubw m4, m3 2824*c0909341SAndroid Build Coastguard Worker.h64_loop: 2825*c0909341SAndroid Build Coastguard Worker mov r4, r5 2826*c0909341SAndroid Build Coastguard Worker sar r4, 6 2827*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2- 14] 2828*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 16] 2829*c0909341SAndroid Build Coastguard Worker pand m2, m7, m4 2830*c0909341SAndroid Build Coastguard Worker psllw m2, 9 2831*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2832*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2833*c0909341SAndroid Build Coastguard Worker 
movu m1, [tlq+r4*2- 30] 2834*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2835*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 32] 2836*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2837*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2838*c0909341SAndroid Build Coastguard Worker sub rsp, 16*8 2839*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2840*c0909341SAndroid Build Coastguard Worker psraw m3, m4, 15 2841*c0909341SAndroid Build Coastguard Worker pand m0, m3 2842*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2843*c0909341SAndroid Build Coastguard Worker por m0, m3 2844*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m512] 2845*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2846*c0909341SAndroid Build Coastguard Worker pand m1, m3 2847*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2848*c0909341SAndroid Build Coastguard Worker mova [rsp+16*7], m0 2849*c0909341SAndroid Build Coastguard Worker por m1, m3 2850*c0909341SAndroid Build Coastguard Worker mova [rsp+16*6], m1 2851*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2- 46] 2852*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 48] 2853*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2854*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2855*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2- 62] 2856*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2857*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 64] 2858*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2859*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2860*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2861*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m1024] 2862*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2863*c0909341SAndroid Build Coastguard Worker pand m0, m3 2864*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2865*c0909341SAndroid Build 
Coastguard Worker por m0, m3 2866*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m1536] 2867*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2868*c0909341SAndroid Build Coastguard Worker pand m1, m3 2869*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2870*c0909341SAndroid Build Coastguard Worker mova [rsp+16*5], m0 2871*c0909341SAndroid Build Coastguard Worker por m1, m3 2872*c0909341SAndroid Build Coastguard Worker mova [rsp+16*4], m1 2873*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2- 78] 2874*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 80] 2875*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2876*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2877*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2- 94] 2878*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2879*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2- 96] 2880*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2881*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2882*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2883*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m2048] 2884*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2885*c0909341SAndroid Build Coastguard Worker pand m0, m3 2886*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2887*c0909341SAndroid Build Coastguard Worker por m0, m3 2888*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m2560] 2889*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2890*c0909341SAndroid Build Coastguard Worker pand m1, m3 2891*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2892*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m0 2893*c0909341SAndroid Build Coastguard Worker por m1, m3 2894*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m1 2895*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4*2-110] 2896*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-112] 
2897*c0909341SAndroid Build Coastguard Worker psubw m3, m0 2898*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2899*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4*2-126] 2900*c0909341SAndroid Build Coastguard Worker paddw m0, m3 2901*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r4*2-128] 2902*c0909341SAndroid Build Coastguard Worker psubw m3, m1 2903*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m2 2904*c0909341SAndroid Build Coastguard Worker movddup m2, [base+pw_m3072] 2905*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2906*c0909341SAndroid Build Coastguard Worker movddup m3, [base+pw_m3584] 2907*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m4 2908*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m4 2909*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2910*c0909341SAndroid Build Coastguard Worker pand m0, m2 2911*c0909341SAndroid Build Coastguard Worker pandn m2, m6 2912*c0909341SAndroid Build Coastguard Worker pand m1, m3 2913*c0909341SAndroid Build Coastguard Worker pandn m3, m6 2914*c0909341SAndroid Build Coastguard Worker por m0, m2 2915*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 2916*c0909341SAndroid Build Coastguard Worker por m1, m3 2917*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m1 2918*c0909341SAndroid Build Coastguard Worker dec wd 2919*c0909341SAndroid Build Coastguard Worker jz .h64_transpose 2920*c0909341SAndroid Build Coastguard Worker add r5, dyq 2921*c0909341SAndroid Build Coastguard Worker jg .h64_loop 2922*c0909341SAndroid Build Coastguard Worker.h64_end_loop: 2923*c0909341SAndroid Build Coastguard Worker sub rsp, 16*8 2924*c0909341SAndroid Build Coastguard Worker REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 2925*c0909341SAndroid Build Coastguard Worker dec wd 2926*c0909341SAndroid Build Coastguard Worker jg .h64_end_loop 2927*c0909341SAndroid Build Coastguard Worker.h64_transpose: 2928*c0909341SAndroid Build Coastguard Worker add r3d, 64*2 
2929*c0909341SAndroid Build Coastguard Worker.end_transpose: 2930*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2931*c0909341SAndroid Build Coastguard Worker lea r7, [strideq*3] 2932*c0909341SAndroid Build Coastguard Worker%else 2933*c0909341SAndroid Build Coastguard Worker mov strideq, [dstq+4*0] 2934*c0909341SAndroid Build Coastguard Worker mov org_wd, [dstq+4*1] 2935*c0909341SAndroid Build Coastguard Worker%endif 2936*c0909341SAndroid Build Coastguard Worker lea r4d, [r3*3] 2937*c0909341SAndroid Build Coastguard Worker.end_transpose_loop: 2938*c0909341SAndroid Build Coastguard Worker lea r2, [rsp+r3-8] 2939*c0909341SAndroid Build Coastguard Worker lea r6, [dstq+org_wq*2-8] 2940*c0909341SAndroid Build Coastguard Worker.end_transpose_loop_y: 2941*c0909341SAndroid Build Coastguard Worker movq m0, [r2+r4 ] 2942*c0909341SAndroid Build Coastguard Worker movq m1, [r2+r3*2] 2943*c0909341SAndroid Build Coastguard Worker movq m2, [r2+r3*1] 2944*c0909341SAndroid Build Coastguard Worker movq m3, [r2+r3*0] 2945*c0909341SAndroid Build Coastguard Worker sub r2, 8 2946*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 2947*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 2948*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m0, m2 2949*c0909341SAndroid Build Coastguard Worker punpckldq m0, m2 2950*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*0], m1 2951*c0909341SAndroid Build Coastguard Worker movq [r6+strideq*1], m1 2952*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2953*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*2], m0 2954*c0909341SAndroid Build Coastguard Worker movq [r6+r7 ], m0 2955*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*4] 2956*c0909341SAndroid Build Coastguard Worker%else 2957*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 2958*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*0], m0 2959*c0909341SAndroid Build Coastguard Worker movq 
[r6+strideq*1], m0 2960*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 2961*c0909341SAndroid Build Coastguard Worker%endif 2962*c0909341SAndroid Build Coastguard Worker cmp r2, rsp 2963*c0909341SAndroid Build Coastguard Worker jae .end_transpose_loop_y 2964*c0909341SAndroid Build Coastguard Worker lea rsp, [rsp+r3*4] 2965*c0909341SAndroid Build Coastguard Worker sub org_wd, 4 2966*c0909341SAndroid Build Coastguard Worker jg .end_transpose_loop 2967*c0909341SAndroid Build Coastguard Worker RET 2968*c0909341SAndroid Build Coastguard Worker.filter_copy: 2969*c0909341SAndroid Build Coastguard Worker neg r4 2970*c0909341SAndroid Build Coastguard Worker pshuflw m2, [tlq+2], q0000 2971*c0909341SAndroid Build Coastguard Worker xor r5d, r5d 2972*c0909341SAndroid Build Coastguard Worker pshuflw m3, [tlq+r4*2], q0000 2973*c0909341SAndroid Build Coastguard Worker movq [rsp+gprsize+16*17], m2 2974*c0909341SAndroid Build Coastguard Worker.filter_copy_loop: 2975*c0909341SAndroid Build Coastguard Worker mova m1, [tlq+r5*2-16*1+2] 2976*c0909341SAndroid Build Coastguard Worker mova m2, [tlq+r5*2-16*2+2] 2977*c0909341SAndroid Build Coastguard Worker sub r5, 16 2978*c0909341SAndroid Build Coastguard Worker mova [rsp+r5*2+gprsize+16*18], m1 2979*c0909341SAndroid Build Coastguard Worker mova [rsp+r5*2+gprsize+16*17], m2 2980*c0909341SAndroid Build Coastguard Worker cmp r5d, r4d 2981*c0909341SAndroid Build Coastguard Worker jg .filter_copy_loop 2982*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+gprsize+16*17-2] 2983*c0909341SAndroid Build Coastguard Worker movq [tlq+r4*2-8], m3 2984*c0909341SAndroid Build Coastguard Worker ret 2985*c0909341SAndroid Build Coastguard Worker.filter_edge: 2986*c0909341SAndroid Build Coastguard Worker cmp r5d, 3 2987*c0909341SAndroid Build Coastguard Worker je .filter_edge_s3 2988*c0909341SAndroid Build Coastguard Worker movddup m4, [base+z_filt_k+r5*8-8] 2989*c0909341SAndroid Build Coastguard Worker movddup m5, 
[base+z_filt_k+r5*8+8] 2990*c0909341SAndroid Build Coastguard Worker xor r5d, r5d 2991*c0909341SAndroid Build Coastguard Worker movddup m6, [base+pw_8] 2992*c0909341SAndroid Build Coastguard Worker movu m2, [tlq-12] 2993*c0909341SAndroid Build Coastguard Worker jmp .filter_edge_start 2994*c0909341SAndroid Build Coastguard Worker.filter_edge_loop: 2995*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r5*2-12] 2996*c0909341SAndroid Build Coastguard Worker mova [tlq+r5*2+2], m1 2997*c0909341SAndroid Build Coastguard Worker.filter_edge_start: 2998*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [tlq+r5*2-14] 2999*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r5*2-16] 3000*c0909341SAndroid Build Coastguard Worker sub r5, 8 3001*c0909341SAndroid Build Coastguard Worker paddw m2, m3 3002*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 3003*c0909341SAndroid Build Coastguard Worker paddw m1, m6 3004*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3005*c0909341SAndroid Build Coastguard Worker psrlw m1, 4 3006*c0909341SAndroid Build Coastguard Worker cmp r5d, r4d 3007*c0909341SAndroid Build Coastguard Worker jg .filter_edge_loop 3008*c0909341SAndroid Build Coastguard Worker mova [tlq+r5*2+2], m1 3009*c0909341SAndroid Build Coastguard Worker neg r4d 3010*c0909341SAndroid Build Coastguard Worker ret 3011*c0909341SAndroid Build Coastguard Worker.filter_edge_s3: 3012*c0909341SAndroid Build Coastguard Worker movddup m5, [base+pw_3] 3013*c0909341SAndroid Build Coastguard Worker xor r5d, r5d 3014*c0909341SAndroid Build Coastguard Worker movu m2, [tlq-12] 3015*c0909341SAndroid Build Coastguard Worker movu m3, [tlq-10] 3016*c0909341SAndroid Build Coastguard Worker jmp .filter_edge_s3_start 3017*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_loop: 3018*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r5*2-12] 3019*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+r5*2-10] 3020*c0909341SAndroid Build Coastguard Worker mova 
[tlq+r5*2+2], m1 3021*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_start: 3022*c0909341SAndroid Build Coastguard Worker paddw m2, [tlq+r5*2-14] 3023*c0909341SAndroid Build Coastguard Worker paddw m3, m5 3024*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r5*2-16] 3025*c0909341SAndroid Build Coastguard Worker movu m4, [tlq+r5*2-18] 3026*c0909341SAndroid Build Coastguard Worker sub r5, 8 3027*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3028*c0909341SAndroid Build Coastguard Worker pavgw m3, m4 3029*c0909341SAndroid Build Coastguard Worker paddw m1, m3 3030*c0909341SAndroid Build Coastguard Worker psrlw m1, 2 3031*c0909341SAndroid Build Coastguard Worker cmp r5d, r4d 3032*c0909341SAndroid Build Coastguard Worker jg .filter_edge_s3_loop 3033*c0909341SAndroid Build Coastguard Worker mova [tlq+r5*2+2], m1 3034*c0909341SAndroid Build Coastguard Worker neg r4d 3035*c0909341SAndroid Build Coastguard Worker ret 3036*c0909341SAndroid Build Coastguard Worker 3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3038*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter 3039*c0909341SAndroid Build Coastguard Worker%else 3040*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter 3041*c0909341SAndroid Build Coastguard Worker%define m8 [esp+16*0] 3042*c0909341SAndroid Build Coastguard Worker%define m9 [esp+16*1] 3043*c0909341SAndroid Build Coastguard Worker%define m10 [esp+16*2] 3044*c0909341SAndroid Build Coastguard Worker%define m11 [esp+16*3] 3045*c0909341SAndroid Build Coastguard Worker%define m12 [esp+16*4] 3046*c0909341SAndroid Build Coastguard Worker%define m13 [esp+16*5] 3047*c0909341SAndroid Build Coastguard Worker%define m14 [esp+16*6] 3048*c0909341SAndroid Build Coastguard Worker%define m15 [esp+16*7] 3049*c0909341SAndroid Build Coastguard Worker%endif 3050*c0909341SAndroid Build Coastguard Worker%define base r6-$$ 
3051*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3052*c0909341SAndroid Build Coastguard Worker movd m6, r8m ; bitdepth_max 3053*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm 3054*c0909341SAndroid Build Coastguard Worker movzx filterd, filterb 3055*c0909341SAndroid Build Coastguard Worker%else 3056*c0909341SAndroid Build Coastguard Worker movzx filterd, byte filterm 3057*c0909341SAndroid Build Coastguard Worker%endif 3058*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 3059*c0909341SAndroid Build Coastguard Worker shl filterd, 6 3060*c0909341SAndroid Build Coastguard Worker movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 3061*c0909341SAndroid Build Coastguard Worker mova m1, [base+filter_intra_taps+filterq+16*0] 3062*c0909341SAndroid Build Coastguard Worker mova m2, [base+filter_intra_taps+filterq+16*1] 3063*c0909341SAndroid Build Coastguard Worker mova m3, [base+filter_intra_taps+filterq+16*2] 3064*c0909341SAndroid Build Coastguard Worker mova m4, [base+filter_intra_taps+filterq+16*3] 3065*c0909341SAndroid Build Coastguard Worker pxor m5, m5 3066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3067*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper 3068*c0909341SAndroid Build Coastguard Worker punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid 3069*c0909341SAndroid Build Coastguard Worker punpcklbw m10, m5, m2 ; having to perform sign-extension. 
3070*c0909341SAndroid Build Coastguard Worker punpckhbw m11, m5, m2 3071*c0909341SAndroid Build Coastguard Worker punpcklbw m12, m5, m3 3072*c0909341SAndroid Build Coastguard Worker punpckhbw m13, m5, m3 3073*c0909341SAndroid Build Coastguard Worker punpcklbw m14, m5, m4 3074*c0909341SAndroid Build Coastguard Worker punpckhbw m15, m5, m4 3075*c0909341SAndroid Build Coastguard Worker%else 3076*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m5, m1 3077*c0909341SAndroid Build Coastguard Worker mova m8, m7 3078*c0909341SAndroid Build Coastguard Worker punpckhbw m7, m5, m1 3079*c0909341SAndroid Build Coastguard Worker mova m9, m7 3080*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m5, m2 3081*c0909341SAndroid Build Coastguard Worker mova m10, m7 3082*c0909341SAndroid Build Coastguard Worker punpckhbw m7, m5, m2 3083*c0909341SAndroid Build Coastguard Worker mova m11, m7 3084*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m5, m3 3085*c0909341SAndroid Build Coastguard Worker mova m12, m7 3086*c0909341SAndroid Build Coastguard Worker punpckhbw m7, m5, m3 3087*c0909341SAndroid Build Coastguard Worker mova m13, m7 3088*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m5, m4 3089*c0909341SAndroid Build Coastguard Worker mova m14, m7 3090*c0909341SAndroid Build Coastguard Worker punpckhbw m7, m5, m4 3091*c0909341SAndroid Build Coastguard Worker mova m15, m7 3092*c0909341SAndroid Build Coastguard Worker%endif 3093*c0909341SAndroid Build Coastguard Worker mova m7, [base+filter_shuf] 3094*c0909341SAndroid Build Coastguard Worker add hd, hd 3095*c0909341SAndroid Build Coastguard Worker mov r5, dstq 3096*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q0000 3097*c0909341SAndroid Build Coastguard Worker mov r6, tlq 3098*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m6 3099*c0909341SAndroid Build Coastguard Worker sub tlq, hq 3100*c0909341SAndroid Build Coastguard Worker.left_loop: 3101*c0909341SAndroid Build Coastguard Worker pshufb 
m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ 3102*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q0000 3103*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8, m1 3104*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m9 3105*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q1111 3106*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10, m4 3107*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m11 3108*c0909341SAndroid Build Coastguard Worker paddd m2, m3 3109*c0909341SAndroid Build Coastguard Worker paddd m1, m4 3110*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q2222 3111*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m4 3112*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 3113*c0909341SAndroid Build Coastguard Worker paddd m2, m3 3114*c0909341SAndroid Build Coastguard Worker paddd m1, m4 3115*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q3333 3116*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m14, m3 3117*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m15 3118*c0909341SAndroid Build Coastguard Worker paddd m0, m2 3119*c0909341SAndroid Build Coastguard Worker paddd m1, m3 3120*c0909341SAndroid Build Coastguard Worker psrad m0, 11 ; x >> 3 3121*c0909341SAndroid Build Coastguard Worker psrad m1, 11 3122*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 3123*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m5 3124*c0909341SAndroid Build Coastguard Worker pavgw m0, m5 ; (x + 8) >> 4 3125*c0909341SAndroid Build Coastguard Worker pminsw m0, m6 3126*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 3127*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 3128*c0909341SAndroid Build Coastguard Worker movlps m0, [tlq+hq-10] 3129*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3130*c0909341SAndroid Build Coastguard Worker sub hd, 2*2 3131*c0909341SAndroid Build Coastguard Worker jg .left_loop 3132*c0909341SAndroid Build Coastguard Worker 
sub wd, 4 3133*c0909341SAndroid Build Coastguard Worker jz .end 3134*c0909341SAndroid Build Coastguard Worker sub tld, r6d ; -h*2 3135*c0909341SAndroid Build Coastguard Worker sub r6, r5 ; tl-dst 3136*c0909341SAndroid Build Coastguard Worker.right_loop0: 3137*c0909341SAndroid Build Coastguard Worker add r5, 8 3138*c0909341SAndroid Build Coastguard Worker mov hd, tld 3139*c0909341SAndroid Build Coastguard Worker movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ 3140*c0909341SAndroid Build Coastguard Worker mov dstq, r5 3141*c0909341SAndroid Build Coastguard Worker.right_loop: 3142*c0909341SAndroid Build Coastguard Worker pshufd m2, m0, q0000 3143*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8, m2 3144*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m9 3145*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q1111 3146*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10, m4 3147*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m11 3148*c0909341SAndroid Build Coastguard Worker pinsrw m0, [dstq+strideq*0-2], 5 3149*c0909341SAndroid Build Coastguard Worker paddd m1, m3 3150*c0909341SAndroid Build Coastguard Worker paddd m2, m4 3151*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q2222 3152*c0909341SAndroid Build Coastguard Worker movddup m4, [dstq+strideq*1-8] 3153*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12, m0 3154*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m13 3155*c0909341SAndroid Build Coastguard Worker paddd m1, m3 3156*c0909341SAndroid Build Coastguard Worker paddd m0, m2 3157*c0909341SAndroid Build Coastguard Worker pshuflw m2, m4, q3333 3158*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5 3159*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14, m2 3160*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m15 3161*c0909341SAndroid Build Coastguard Worker paddd m1, m3 3162*c0909341SAndroid Build Coastguard Worker paddd m0, m2 3163*c0909341SAndroid Build Coastguard Worker psrad m1, 11 
; Tail of the .right_loop body. The loop head and the beginning of this
; function precede this span.
    psrad                m0, 11
    packssdw             m0, m1
    pmaxsw               m0, m5
    pavgw                m0, m5 ; .left_loop annotates this same step as (x + 8) >> 4
    pminsw               m0, m6
    movhps [dstq+strideq*0], m0 ; high qword -> first row of the pair
    movq   [dstq+strideq*1], m0 ; low qword -> second row of the pair
    palignr              m0, m4, 14
    lea                dstq, [dstq+strideq*2]
    add                  hd, 2*2 ; two rows done; loop while hd is still negative
    jl .right_loop
    sub                  wd, 4
    jg .right_loop0
.end:
    RET

; t0 is the scratch register used for jump-table base addresses by the CfL
; functions: r7 on UNIX64, r5 on every other target.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

; ipred_cfl_top_16bpc(dst, stride, tl, w, h, ac, ...)
; Entry stub that reuses the body of ipred_cfl_left_16bpc. It sets up the
; same registers that function prepares before its .start label, but from w
; instead of h and with tlq advanced by one 16-bit element:
;   m4 = w, m5 = ctz(w), wd = ctz(w), r6 = table entry selected by ctz(w).
; NOTE(review): the pixels summed are presumably the w top-edge pixels that
; follow the top-left one - confirm against the C reference.
cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    movd                 m4, wd ; pixel count, read before wd is replaced by ctz(w)
    tzcnt                wd, wd
    movifnidn            hd, hm
    add                 tlq, 2 ; skip one 16-bit element
    movsxd               r6, [t0+wq*4] ; 32-bit entry of the left table, indexed by ctz(w)
    movd                 m5, wd ; shift count used by the shared averaging code
    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
; ipred_cfl_left_16bpc(dst, stride, tl, w, h, ac, alpha, ...)
; Averages the h 16-bit pixels stored directly below tl in memory, then
; tail-jumps through ipred_cfl_splat_16bpc_ssse3_table (indexed by ctz(w)).
; State at the final jump:
;   m0 = average and m7 = r7m, both broadcast to all 8 words
;   m6 = 0, m3 = -1 in every word, acq = ac pointer, hd = h
; ipred_cfl_top_16bpc joins at .start with m4/m5/r6/wd/tlq prepared from w.
; NOTE(review): r7m is presumably bitdepth_max (it is the upper clamp in
; IPRED_CFL), and the splat table presumably targets the .s4/.s8/.s16/.s32
; labels of ipred_cfl_16bpc - both are defined outside this span.
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    tzcnt                wd, wm ; ctz(w): index into the splat table at .start
    lea                 r6d, [hq*2] ; h pixels * 2 bytes each
    movd                 m4, hd ; pixel count, becomes the rounding bias at .start
    sub                 tlq, r6 ; tlq = tl - 2*h
    tzcnt               r6d, hd
    movd                 m5, r6d ; shift count for the final psrld
    movsxd               r6, [t0+r6*4] ; 32-bit table entry selected by ctz(h)
.start:
    movd                 m7, r7m
    movu                 m0, [tlq] ; first 8 pixels of the edge
    add                  r6, t0 ; table entry + table base = jump target
    ; rebase t0 from the left table to the splat table
    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
    movsxd               wq, [t0+wq*4]
    pxor                 m6, m6 ; m6 = 0
    pshuflw              m7, m7, q0000
    pcmpeqw              m3, m3 ; m3 = -1 in every word
    add                  wq, t0 ; last use of t0
    ; acq is r5 by argument order, which is also t0 on non-UNIX64 targets,
    ; so this load has to come after the last use of t0.
    movifnidn           acq, acmp
    pavgw                m4, m6 ; (n + 1) >> 1 in the low word, 0 elsewhere
    punpcklqdq           m7, m7 ; m7 = r7m in all 8 words
    jmp                  r6
; The .hN labels fall through into each other, folding more pixels into m0.
.h32:
    movu                 m1, [tlq+48]
    movu                 m2, [tlq+32]
    paddw                m0, m1
    paddw                m0, m2
.h16:
    movu                 m1, [tlq+16]
    paddw                m0, m1
.h8:
    pshufd               m1, m0, q1032 ; swap qwords
    paddw                m0, m1 ; fold the upper four words onto the lower four
.h4:
    ; m3 = -1, so pmaddwd gives -(a + b) per adjacent word pair as a dword;
    ; subtracting that from m4 adds the pair sums to the rounding bias.
    pmaddwd              m0, m3
    psubd                m4, m0
    pshuflw              m0, m4, q1032 ; swap dwords 0 and 1
    paddd                m0, m4 ; dword 0 = bias + sum of the low four words
    psrld                m0, m5 ; >> ctz(n), a divide by n when n is a power of two
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0 ; broadcast the average to all 8 words
    jmp                  wq

; IPRED_CFL dst, src
; m<dst> = clamp(m0 + sign(alpha * ac) * ((|alpha| * |ac| + 32) >> 6), m6, m7)
; per 16-bit lane, where ac = m<src>.
; Expects m0 = dc, m1 = alpha, m2 = |alpha| << 9, m6 = lower bound,
; m7 = upper bound, each broadcast to every word. m<src> is overwritten.
; pmulhrsw computes (a * b + 0x4000) >> 15, which with b = |alpha| << 9
; equals (|alpha| * |ac| + 32) >> 6.
; NOTE(review): assumes |alpha| << 9 fits in a signed word - confirm range.
%macro IPRED_CFL 2 ; dst, src
    pabsw               m%1, m%2
    pmulhrsw            m%1, m2
    psignw              m%2, m1 ; ac * sign(alpha)
    psignw              m%1, m%2 ; give the magnitude the sign of alpha * ac
    paddw               m%1, m0
    pmaxsw              m%1, m6
    pminsw              m%1, m7
%endmacro

; ipred_cfl_16bpc(dst, stride, tl, w, h, ac, alpha, ...)
; 1. .hN sums the h pixels below tl into m0 and jumps to .wN.
; 2. .wN adds the w pixels starting at tl+2 and divides by w + h with
;    rounding. The power-of-two part is a right shift. If w != h the
;    remaining factor is removed by multiplying with 0xAAAB (~2^17/3) or
;    0x6667 (~2^17/5) and shifting right by 17 (pmulhuw + psrlw 1).
; 3. .sN applies IPRED_CFL to the ac buffer and stores the rows.
; The jump table holds the height entries first; the width entries start
; four slots (4*4 bytes) later.
cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    tzcnt               r6d, hd ; index of the height entry
    lea                 t0d, [wq+hq]
    movd                 m4, t0d ; w + h, halved below to form the rounding bias
    tzcnt               t0d, t0d
    movd                 m5, t0d ; ctz(w + h): power-of-two part of the divisor
    LEA                  t0, ipred_cfl_16bpc_ssse3_table
    tzcnt                wd, wd
    movd                 m7, r7m
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+4*4]
    psrlw                m4, 1 ; (w + h) >> 1
    pxor                 m6, m6 ; m6 = 0
    pshuflw              m7, m7, q0000
    add                  r6, t0
    add                  wq, t0 ; last use of t0
    movifnidn           acq, acmp ; may overwrite t0 where t0 aliases r5
    pcmpeqw              m3, m3 ; m3 = -1 in every word
    punpcklqdq           m7, m7 ; m7 = r7m in all 8 words
    jmp                  r6
.h4:
    movq                 m0, [tlq-8] ; 4 pixels below tl
    jmp                  wq
.w4:
    movq                 m1, [tlq+2] ; 4 pixels starting one element after tl
    paddw                m0, m1
    pmaddwd              m0, m3 ; -(pair sums) as dwords
    psubd                m4, m0 ; bias + pair sums
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4 ; dword 0 = bias + sum of all edge pixels
    cmp                  hd, 4
    jg .w4_mul
    psrld                m0, 3 ; w + h = 8
    jmp .w4_end
.w4_mul:
    ; r2 is tl, which is no longer needed once both edges have been loaded.
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 16
    cmove               r6d, r2d ; h == 16: w + h = 20 = 4*5, otherwise 4*3
    movd                 m1, r6d
    psrld                m0, 2
    pmulhuw              m0, m1
    psrlw                m0, 1
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0 ; broadcast dc
.s4:
    movd                 m1, alpham
    lea                  r6, [strideq*3]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1 ; m1 = alpha in all 8 words
    pabsw                m2, m1
    psllw                m2, 9 ; m2 = |alpha| << 9 for pmulhrsw in IPRED_CFL
.s4_loop:
    ; 16 coefficients = 4 rows of 4 pixels per iteration
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5 ; m4 is free again after the first expansion
    movq   [dstq+strideq*0], m3
    movhps [dstq+strideq*1], m3
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4_loop
    RET
.h8:
    mova                 m0, [tlq-16] ; 8 pixels below tl
    jmp                  wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5 ; >> ctz(w + h)
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmove               r6d, r2d ; h == 32: w + h = 40 = 8*5, otherwise a multiple of 3
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    ; two rows of 8 pixels per iteration
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova   [dstq+strideq*0], m3
    mova   [dstq+strideq*1], m4
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s8_loop
    RET
.h16:
    mova                 m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hd, 8|32
    cmovz               r6d, r2d ; h is neither 8 nor 32: use the 1/5 multiplier
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    ; one row of 16 pixels per iteration
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    add                dstq, strideq
    dec                  hd
    jg .s16_loop
    RET
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m1, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 8
    cmove               r6d, r2d ; h == 8: w + h = 40 = 8*5, otherwise a multiple of 3
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    ; one row of 32 pixels per iteration, processed as two 16-pixel halves
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    mova                 m4, [acq+16*2]
    mova                 m5, [acq+16*3]
    add                 acq, 16*4
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*2], m3
    mova        [dstq+16*3], m4
    add                dstq, strideq
    dec                  hd
    jg .s32_loop
    RET

; ipred_cfl_128_16bpc(dst, stride, tl, w, h, ac, ...)
; Variant that reads no edge pixels: m0 is loaded from a table of 8-byte
; constants that starts at pw_512, indexed by r7m >> 11, before jumping
; through ipred_cfl_splat_16bpc_ssse3_table.
; NOTE(review): with r7m = bitdepth_max the index would be 0 for 10-bit and
; 1 for 12-bit; the constant following pw_512 is not visible here - confirm.
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
    tzcnt                wd, wm
    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
    mov                 r6d, r7m
    movifnidn            hd, hm
    shr                 r6d, 11
    movd                 m7, r7m
    movsxd               wq, [t0+wq*4]
    ; t0 holds the splat table address; subtracting the table symbol and
    ; adding pw_512 addresses the constant relative to that same base.
    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
    pshuflw              m7, m7, q0000
    pxor                 m6, m6 ; m6 = 0
    add                  wq, t0 ; last use of t0
    movifnidn           acq, acmp
    punpcklqdq           m7, m7 ; m7 = r7m in all 8 words
    jmp                  wq

; ipred_cfl_ac_420_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Fills the ac buffer from the pixels at ypx. Each output coefficient is
; 2 * (sum of a 2x2 block): pmaddwd with words of 2 sums horizontal pairs,
; then two source rows are added.
;   wpad: right padding in groups of 4 output columns, filled by replicating
;         the last computed coefficient of the row.
;   hpad: bottom padding in groups of 4 output rows (hpad << 2 rows), filled
;         by repeating the last computed row.
; m4 accumulates the sum of every coefficient written. At .dc the rounded
; average is computed and subtracted from the whole buffer.
; The .dc and .hpad labels are also jump targets of ipred_cfl_ac_422_16bpc.
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build words of 2 without a memory constant: -1 -> 1 -> 2
    pcmpeqw              m5, m5
    pabsw                m5, m5
    paddw                m5, m5
%else
    movddup              m5, [pw_2]
%endif
    mov                  hd, hm
    shl               hpadd, 2 ; padded rows = hpad * 4
    pxor                 m4, m4 ; running sum
    sub                  hd, hpadd ; rows actually computed from the source
    cmp            dword wm, 8
    mov                  r5, acq ; remember the buffer start (mov keeps the flags)
    jg .w16
    je .w8
    lea                  r3, [strideq*3] ; r3 is the wpad register, unused on the w4 path
.w4_loop:
    ; four source rows -> two output rows of 4 coefficients
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    pmaddwd              m2, m5, [ypxq+strideq*2]
    pmaddwd              m3, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m0, m1
    paddd                m2, m3
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    punpckhqdq           m0, m0 ; last output row in both halves
    pslld                m2, 2 ; sum of four copies of the last row
.w4_hpad:
    ; four padded rows per iteration
    mova         [acq+16*0], m0
    paddd                m4, m2
    mova         [acq+16*1], m0
    add                 acq, 16*2
    sub               hpadd, 4
    jg .w4_hpad
    jmp .dc
.w8:
    ; only 3 arguments are loaded by cglobal, so x86-32 tests wpad in memory
%if ARCH_X86_32
    cmp         dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    ; two source rows of 16 pixels -> one output row of 8 coefficients
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m2
    paddd                m1, m3
    paddd                m2, m0, m1 ; row sum, kept in m2 for the hpad code
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    pslld                m2, 2 ; .hpad stores four rows per iteration
    mova                 m1, m0 ; .hpad alternates m0/m1, so make both the last row
    jmp .hpad
.w8_wpad1:
    ; right half is padding: compute 4 coefficients and replicate the last
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1
    pshufd               m1, m0, q3333
    paddd                m2, m0, m1
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
; Width-16 padding: the row is held as four groups of 4 dwords in
; m0, m3, m1, m2 (left to right). Each stub fills the missing groups with
; the last dword of the rightmost computed group.
.w16_wpad3:
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM       7 ; this path uses m6, one more than cglobal declared
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
    paddd                m0, m6
    ; The flags of this cmp feed jg, je and jp below. Only SSE instructions
    ; run in between, and they leave EFLAGS untouched.
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
    paddd                m3, m6
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
    paddd                m1, m6
    ; PF reflects the low byte of wpad - 2: 0xFF (wpad == 1) has even
    ; parity and jumps, 0xFE (wpad == 0) has odd parity and falls through.
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
    paddd                m2, m6
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    paddd                m6, m0, m3
    packssdw             m0, m3
    paddd                m6, m1
    mova         [acq+16*0], m0
    packssdw             m1, m2
    paddd                m2, m6 ; m2 = sum of the whole row
    mova         [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd ; .hpad covers only two 16-wide rows per pass
    jz .dc
    paddd                m2, m2 ; sum of two copies of the last row
.hpad:
    ; writes m0, m1, m0, m1 (64 bytes) and adds m2 to the running sum
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    paddd                m4, m2
    mova         [acq+16*2], m0
    mova         [acq+16*3], m1
    add                 acq, 16*4
    sub               hpadd, 4
    jg .hpad
.dc:
    ; r5 becomes the negative byte count of the buffer. ctz(bytes) - 2 is
    ; log2(number of coefficients) - 1; the pavgw against zero supplies the
    ; final rounding halving: ((sum >> (log2(n) - 1)) + 1) >> 1.
    sub                  r5, acq ; -w*h*2
    pshufd               m2, m4, q1032
    tzcnt               r1d, r5d ; r1 is ypx, no longer needed
    paddd                m2, m4
    sub                 r1d, 2
    pshufd               m4, m2, q2301
    movd                 m0, r1d
    paddd                m2, m4 ; every dword = total sum
    psrld                m2, m0
    pxor                 m0, m0
    pavgw                m2, m0
    packssdw             m2, m2 ; average in all 8 words
.dc_loop:
    ; subtract the average in place; r5 runs from -size up to 0
    mova                 m0, [acq+r5+16*0]
    mova                 m1, [acq+r5+16*1]
    psubw                m0, m2
    psubw                m1, m2
    mova      [acq+r5+16*0], m0
    mova      [acq+r5+16*1], m1
    add                  r5, 16*2
    jl .dc_loop
    RET

; ipred_cfl_ac_422_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Only the first instructions of this function fall inside this span; the
; rest of the body follows it. The prologue mirrors ipred_cfl_ac_420_16bpc
; but uses words of 4 as the pmaddwd multiplier.
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build words of 4 without a memory constant: -1 -> 1 -> 4
    pcmpeqw              m5, m5
    pabsw                m5, m5
    psllw                m5, 2
%else
    movddup              m5, [pw_4]
%endif
    mov                  hd, hm
    shl               hpadd, 2
    pxor                 m4, m4
    sub                  hd, hpadd
    cmp            dword wm, 8
    mov                  r5, acq
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m3, m5, [ypxq+strideq*1]
    pmaddwd              m1, m5, [ypxq+strideq*2]
Worker pmaddwd m2, m5, [ypxq+r3 ] 3677*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*4] 3678*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3679*c0909341SAndroid Build Coastguard Worker packssdw m0, m3 3680*c0909341SAndroid Build Coastguard Worker paddd m3, m1 3681*c0909341SAndroid Build Coastguard Worker packssdw m1, m2 3682*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3683*c0909341SAndroid Build Coastguard Worker paddd m4, m3 3684*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3685*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3686*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3687*c0909341SAndroid Build Coastguard Worker sub hd, 4 3688*c0909341SAndroid Build Coastguard Worker jg .w4_loop 3689*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3690*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3691*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 3692*c0909341SAndroid Build Coastguard Worker pslld m2, 3 3693*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m1 3694*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3695*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3696*c0909341SAndroid Build Coastguard Worker mova [acq+16*2], m1 3697*c0909341SAndroid Build Coastguard Worker mova [acq+16*3], m1 3698*c0909341SAndroid Build Coastguard Worker add acq, 16*4 3699*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3700*c0909341SAndroid Build Coastguard Worker.w8: 3701*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3702*c0909341SAndroid Build Coastguard Worker cmp dword wpadm, 0 3703*c0909341SAndroid Build Coastguard Worker%else 3704*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 3705*c0909341SAndroid Build Coastguard Worker%endif 3706*c0909341SAndroid Build Coastguard Worker jnz .w8_wpad1 3707*c0909341SAndroid Build 
Coastguard Worker.w8_loop: 3708*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5, [ypxq+strideq*0+16*0] 3709*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, [ypxq+strideq*0+16*1] 3710*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5, [ypxq+strideq*1+16*0] 3711*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m5, [ypxq+strideq*1+16*1] 3712*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*2] 3713*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3714*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 3715*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3716*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3717*c0909341SAndroid Build Coastguard Worker paddd m2, m1, m3 3718*c0909341SAndroid Build Coastguard Worker packssdw m1, m3 3719*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3720*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3721*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3722*c0909341SAndroid Build Coastguard Worker sub hd, 2 3723*c0909341SAndroid Build Coastguard Worker jg .w8_loop 3724*c0909341SAndroid Build Coastguard Worker.w8_hpad: 3725*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3726*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3727*c0909341SAndroid Build Coastguard Worker pslld m2, 2 3728*c0909341SAndroid Build Coastguard Worker mova m0, m1 3729*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad 3730*c0909341SAndroid Build Coastguard Worker.w8_wpad1: 3731*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5, [ypxq+strideq*0] 3732*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5, [ypxq+strideq*1] 3733*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*2] 3734*c0909341SAndroid Build Coastguard Worker pshufd m2, m0, q3333 3735*c0909341SAndroid Build Coastguard Worker pshufd m3, m1, q3333 
3736*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3737*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 3738*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3739*c0909341SAndroid Build Coastguard Worker paddd m2, m1, m3 3740*c0909341SAndroid Build Coastguard Worker packssdw m1, m3 3741*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3742*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3743*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3744*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3745*c0909341SAndroid Build Coastguard Worker sub hd, 2 3746*c0909341SAndroid Build Coastguard Worker jg .w8_wpad1 3747*c0909341SAndroid Build Coastguard Worker jmp .w8_hpad 3748*c0909341SAndroid Build Coastguard Worker.w16_wpad3: 3749*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q3333 3750*c0909341SAndroid Build Coastguard Worker mova m1, m3 3751*c0909341SAndroid Build Coastguard Worker mova m2, m3 3752*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_end 3753*c0909341SAndroid Build Coastguard Worker.w16_wpad2: 3754*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q3333 3755*c0909341SAndroid Build Coastguard Worker mova m2, m1 3756*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_end 3757*c0909341SAndroid Build Coastguard Worker.w16_wpad1: 3758*c0909341SAndroid Build Coastguard Worker pshufd m2, m1, q3333 3759*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_end 3760*c0909341SAndroid Build Coastguard Worker.w16: 3761*c0909341SAndroid Build Coastguard Worker movifnidn wpadd, wpadm 3762*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 3763*c0909341SAndroid Build Coastguard Worker.w16_loop: 3764*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5, [ypxq+16*0] 3765*c0909341SAndroid Build Coastguard Worker cmp wpadd, 2 3766*c0909341SAndroid Build Coastguard Worker jg .w16_wpad3 3767*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m5, [ypxq+16*1] 3768*c0909341SAndroid 
Build Coastguard Worker je .w16_wpad2 3769*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5, [ypxq+16*2] 3770*c0909341SAndroid Build Coastguard Worker jp .w16_wpad1 3771*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, [ypxq+16*3] 3772*c0909341SAndroid Build Coastguard Worker.w16_wpad_end: 3773*c0909341SAndroid Build Coastguard Worker add ypxq, strideq 3774*c0909341SAndroid Build Coastguard Worker paddd m6, m0, m3 3775*c0909341SAndroid Build Coastguard Worker packssdw m0, m3 3776*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3777*c0909341SAndroid Build Coastguard Worker paddd m6, m1 3778*c0909341SAndroid Build Coastguard Worker packssdw m1, m2 3779*c0909341SAndroid Build Coastguard Worker paddd m2, m6 3780*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3781*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3782*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3783*c0909341SAndroid Build Coastguard Worker dec hd 3784*c0909341SAndroid Build Coastguard Worker jg .w16_loop 3785*c0909341SAndroid Build Coastguard Worker WIN64_RESTORE_XMM 3786*c0909341SAndroid Build Coastguard Worker add hpadd, hpadd 3787*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3788*c0909341SAndroid Build Coastguard Worker paddd m2, m2 3789*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad 3790*c0909341SAndroid Build Coastguard Worker 3791*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h 3792*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table 3793*c0909341SAndroid Build Coastguard Worker LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table 3794*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3795*c0909341SAndroid Build Coastguard Worker movifnidn hpadd, hpadm 3796*c0909341SAndroid Build Coastguard Worker pxor m4, m4 
3797*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 3798*c0909341SAndroid Build Coastguard Worker movddup m5, [base+pw_1] 3799*c0909341SAndroid Build Coastguard Worker add wq, r6 3800*c0909341SAndroid Build Coastguard Worker mov hd, hm 3801*c0909341SAndroid Build Coastguard Worker shl hpadd, 2 3802*c0909341SAndroid Build Coastguard Worker sub hd, hpadd 3803*c0909341SAndroid Build Coastguard Worker jmp wq 3804*c0909341SAndroid Build Coastguard Worker.w4: 3805*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 3806*c0909341SAndroid Build Coastguard Worker mov r5, acq 3807*c0909341SAndroid Build Coastguard Worker.w4_loop: 3808*c0909341SAndroid Build Coastguard Worker movq m0, [ypxq+strideq*0] 3809*c0909341SAndroid Build Coastguard Worker movhps m0, [ypxq+strideq*1] 3810*c0909341SAndroid Build Coastguard Worker movq m1, [ypxq+strideq*2] 3811*c0909341SAndroid Build Coastguard Worker movhps m1, [ypxq+r3 ] 3812*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*4] 3813*c0909341SAndroid Build Coastguard Worker psllw m0, 3 3814*c0909341SAndroid Build Coastguard Worker psllw m1, 3 3815*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3816*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5 3817*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3818*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, m1 3819*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3820*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3821*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3822*c0909341SAndroid Build Coastguard Worker sub hd, 4 3823*c0909341SAndroid Build Coastguard Worker jg .w4_loop 3824*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3825*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3826*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 3827*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m1 
3828*c0909341SAndroid Build Coastguard Worker pslld m2, 2 3829*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3830*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m2 3831*c0909341SAndroid Build Coastguard Worker mova [acq+16*2], m1 3832*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3833*c0909341SAndroid Build Coastguard Worker mova [acq+16*3], m1 3834*c0909341SAndroid Build Coastguard Worker add acq, 16*4 3835*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3836*c0909341SAndroid Build Coastguard Worker.w8: 3837*c0909341SAndroid Build Coastguard Worker mov r5, acq 3838*c0909341SAndroid Build Coastguard Worker.w8_loop: 3839*c0909341SAndroid Build Coastguard Worker mova m0, [ypxq+strideq*0] 3840*c0909341SAndroid Build Coastguard Worker mova m1, [ypxq+strideq*1] 3841*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*2] 3842*c0909341SAndroid Build Coastguard Worker psllw m0, 3 3843*c0909341SAndroid Build Coastguard Worker psllw m1, 3 3844*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3845*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5 3846*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3847*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, m1 3848*c0909341SAndroid Build Coastguard Worker add acq, 16*2 3849*c0909341SAndroid Build Coastguard Worker paddd m4, m0 3850*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3851*c0909341SAndroid Build Coastguard Worker sub hd, 2 3852*c0909341SAndroid Build Coastguard Worker jg .w8_loop 3853*c0909341SAndroid Build Coastguard Worker.w8_hpad: 3854*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3855*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3856*c0909341SAndroid Build Coastguard Worker pslld m2, 2 3857*c0909341SAndroid Build Coastguard Worker mova m0, m1 3858*c0909341SAndroid Build Coastguard Worker jmp 
mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad 3859*c0909341SAndroid Build Coastguard Worker.w16_wpad2: 3860*c0909341SAndroid Build Coastguard Worker pshufhw m3, m2, q3333 3861*c0909341SAndroid Build Coastguard Worker pshufhw m1, m0, q3333 3862*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 3863*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 3864*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_end 3865*c0909341SAndroid Build Coastguard Worker.w16: 3866*c0909341SAndroid Build Coastguard Worker movifnidn wpadd, wpadm 3867*c0909341SAndroid Build Coastguard Worker mov r5, acq 3868*c0909341SAndroid Build Coastguard Worker.w16_loop: 3869*c0909341SAndroid Build Coastguard Worker mova m2, [ypxq+strideq*0+16*0] 3870*c0909341SAndroid Build Coastguard Worker mova m0, [ypxq+strideq*1+16*0] 3871*c0909341SAndroid Build Coastguard Worker psllw m2, 3 3872*c0909341SAndroid Build Coastguard Worker psllw m0, 3 3873*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 3874*c0909341SAndroid Build Coastguard Worker jnz .w16_wpad2 3875*c0909341SAndroid Build Coastguard Worker mova m3, [ypxq+strideq*0+16*1] 3876*c0909341SAndroid Build Coastguard Worker mova m1, [ypxq+strideq*1+16*1] 3877*c0909341SAndroid Build Coastguard Worker psllw m3, 3 3878*c0909341SAndroid Build Coastguard Worker psllw m1, 3 3879*c0909341SAndroid Build Coastguard Worker.w16_wpad_end: 3880*c0909341SAndroid Build Coastguard Worker lea ypxq, [ypxq+strideq*2] 3881*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m2 3882*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5 3883*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m3 3884*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m5 3885*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3886*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m5, m0 3887*c0909341SAndroid Build Coastguard Worker mova [acq+16*2], m0 3888*c0909341SAndroid Build Coastguard Worker paddd m4, m3 
3889*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m5, m1 3890*c0909341SAndroid Build Coastguard Worker mova [acq+16*3], m1 3891*c0909341SAndroid Build Coastguard Worker add acq, 16*4 3892*c0909341SAndroid Build Coastguard Worker paddd m2, m3 3893*c0909341SAndroid Build Coastguard Worker paddd m4, m2 3894*c0909341SAndroid Build Coastguard Worker sub hd, 2 3895*c0909341SAndroid Build Coastguard Worker jg .w16_loop 3896*c0909341SAndroid Build Coastguard Worker add hpadd, hpadd 3897*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3898*c0909341SAndroid Build Coastguard Worker paddd m2, m2 3899*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad 3900*c0909341SAndroid Build Coastguard Worker.w32_wpad6: 3901*c0909341SAndroid Build Coastguard Worker pshufhw m1, m0, q3333 3902*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 3903*c0909341SAndroid Build Coastguard Worker mova m2, m1 3904*c0909341SAndroid Build Coastguard Worker mova m3, m1 3905*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_end 3906*c0909341SAndroid Build Coastguard Worker.w32_wpad4: 3907*c0909341SAndroid Build Coastguard Worker pshufhw m2, m1, q3333 3908*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m2 3909*c0909341SAndroid Build Coastguard Worker mova m3, m2 3910*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_end 3911*c0909341SAndroid Build Coastguard Worker.w32_wpad2: 3912*c0909341SAndroid Build Coastguard Worker pshufhw m3, m2, q3333 3913*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 3914*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_end 3915*c0909341SAndroid Build Coastguard Worker.w32: 3916*c0909341SAndroid Build Coastguard Worker movifnidn wpadd, wpadm 3917*c0909341SAndroid Build Coastguard Worker mov r5, acq 3918*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 8 3919*c0909341SAndroid Build Coastguard Worker.w32_loop: 
3920*c0909341SAndroid Build Coastguard Worker mova m0, [ypxq+16*0] 3921*c0909341SAndroid Build Coastguard Worker psllw m0, 3 3922*c0909341SAndroid Build Coastguard Worker cmp wpadd, 4 3923*c0909341SAndroid Build Coastguard Worker jg .w32_wpad6 3924*c0909341SAndroid Build Coastguard Worker mova m1, [ypxq+16*1] 3925*c0909341SAndroid Build Coastguard Worker psllw m1, 3 3926*c0909341SAndroid Build Coastguard Worker je .w32_wpad4 3927*c0909341SAndroid Build Coastguard Worker mova m2, [ypxq+16*2] 3928*c0909341SAndroid Build Coastguard Worker psllw m2, 3 3929*c0909341SAndroid Build Coastguard Worker jnp .w32_wpad2 3930*c0909341SAndroid Build Coastguard Worker mova m3, [ypxq+16*3] 3931*c0909341SAndroid Build Coastguard Worker psllw m3, 3 3932*c0909341SAndroid Build Coastguard Worker.w32_wpad_end: 3933*c0909341SAndroid Build Coastguard Worker add ypxq, strideq 3934*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m5, m0 3935*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3936*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m5, m1 3937*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3938*c0909341SAndroid Build Coastguard Worker paddd m6, m7 3939*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m5, m2 3940*c0909341SAndroid Build Coastguard Worker mova [acq+16*2], m2 3941*c0909341SAndroid Build Coastguard Worker paddd m6, m7 3942*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m5, m3 3943*c0909341SAndroid Build Coastguard Worker mova [acq+16*3], m3 3944*c0909341SAndroid Build Coastguard Worker add acq, 16*4 3945*c0909341SAndroid Build Coastguard Worker paddd m6, m7 3946*c0909341SAndroid Build Coastguard Worker paddd m4, m6 3947*c0909341SAndroid Build Coastguard Worker dec hd 3948*c0909341SAndroid Build Coastguard Worker jg .w32_loop 3949*c0909341SAndroid Build Coastguard Worker%if WIN64 3950*c0909341SAndroid Build Coastguard Worker mova m5, m6 3951*c0909341SAndroid Build Coastguard Worker WIN64_RESTORE_XMM 3952*c0909341SAndroid 
Build Coastguard Worker SWAP 5, 6 3953*c0909341SAndroid Build Coastguard Worker%endif 3954*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3955*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3956*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop: 3957*c0909341SAndroid Build Coastguard Worker mova [acq+16*0], m0 3958*c0909341SAndroid Build Coastguard Worker mova [acq+16*1], m1 3959*c0909341SAndroid Build Coastguard Worker paddd m4, m6 3960*c0909341SAndroid Build Coastguard Worker mova [acq+16*2], m2 3961*c0909341SAndroid Build Coastguard Worker mova [acq+16*3], m3 3962*c0909341SAndroid Build Coastguard Worker add acq, 16*4 3963*c0909341SAndroid Build Coastguard Worker dec hpadd 3964*c0909341SAndroid Build Coastguard Worker jg .w32_hpad_loop 3965*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc 3966*c0909341SAndroid Build Coastguard Worker 3967*c0909341SAndroid Build Coastguard Workercglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h 3968*c0909341SAndroid Build Coastguard Worker%define base r2-pal_pred_16bpc_ssse3_table 3969*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3970*c0909341SAndroid Build Coastguard Worker %define hd r2d 3971*c0909341SAndroid Build Coastguard Worker%endif 3972*c0909341SAndroid Build Coastguard Worker mova m4, [palq] 3973*c0909341SAndroid Build Coastguard Worker LEA r2, pal_pred_16bpc_ssse3_table 3974*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3975*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+pal_pred_shuf] 3976*c0909341SAndroid Build Coastguard Worker movsxd wq, [r2+wq*4] 3977*c0909341SAndroid Build Coastguard Worker pshufd m5, m4, q1032 3978*c0909341SAndroid Build Coastguard Worker add wq, r2 3979*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3980*c0909341SAndroid Build Coastguard Worker jmp wq 3981*c0909341SAndroid Build Coastguard Worker.w4: 
3982*c0909341SAndroid Build Coastguard Worker movq m0, [idxq] 3983*c0909341SAndroid Build Coastguard Worker add idxq, 8 3984*c0909341SAndroid Build Coastguard Worker psrlw m1, m0, 4 3985*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 3986*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 3987*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 3988*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 3989*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 3990*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 3991*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 3992*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3993*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m1 3994*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m1 3995*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3996*c0909341SAndroid Build Coastguard Worker sub hd, 4 3997*c0909341SAndroid Build Coastguard Worker jg .w4 3998*c0909341SAndroid Build Coastguard Worker RET 3999*c0909341SAndroid Build Coastguard Worker.w8: 4000*c0909341SAndroid Build Coastguard Worker movu m3, [idxq] 4001*c0909341SAndroid Build Coastguard Worker add idxq, 16 4002*c0909341SAndroid Build Coastguard Worker psrlw m1, m3, 4 4003*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3, m1 4004*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 4005*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 4006*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 4007*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4008*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4009*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 4010*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 4011*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4012*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, 
m3 4013*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m3 4014*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4015*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4016*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 4017*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 4018*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4019*c0909341SAndroid Build Coastguard Worker sub hd, 4 4020*c0909341SAndroid Build Coastguard Worker jg .w8 4021*c0909341SAndroid Build Coastguard Worker RET 4022*c0909341SAndroid Build Coastguard Worker.w16: 4023*c0909341SAndroid Build Coastguard Worker movu m3, [idxq] 4024*c0909341SAndroid Build Coastguard Worker add idxq, 16 4025*c0909341SAndroid Build Coastguard Worker psrlw m1, m3, 4 4026*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3, m1 4027*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 4028*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 4029*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 4030*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4031*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4032*c0909341SAndroid Build Coastguard Worker mova [dstq+ 0], m0 4033*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 4034*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m3 4035*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m3 4036*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4037*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4038*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+ 0], m0 4039*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+16], m1 4040*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4041*c0909341SAndroid Build Coastguard Worker sub hd, 2 4042*c0909341SAndroid Build Coastguard Worker jg .w16 4043*c0909341SAndroid Build Coastguard Worker RET 4044*c0909341SAndroid Build 
Coastguard Worker.w32: 4045*c0909341SAndroid Build Coastguard Worker movu m3, [idxq] 4046*c0909341SAndroid Build Coastguard Worker add idxq, 16 4047*c0909341SAndroid Build Coastguard Worker psrlw m1, m3, 4 4048*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3, m1 4049*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 4050*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 4051*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 4052*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4053*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4054*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m0 4055*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m1 4056*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m3 4057*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m3 4058*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4059*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4060*c0909341SAndroid Build Coastguard Worker mova [dstq+16*2], m0 4061*c0909341SAndroid Build Coastguard Worker mova [dstq+16*3], m1 4062*c0909341SAndroid Build Coastguard Worker add dstq, strideq 4063*c0909341SAndroid Build Coastguard Worker dec hd 4064*c0909341SAndroid Build Coastguard Worker jg .w32 4065*c0909341SAndroid Build Coastguard Worker RET 4066*c0909341SAndroid Build Coastguard Worker.w64: 4067*c0909341SAndroid Build Coastguard Worker movu m3, [idxq+16*0] 4068*c0909341SAndroid Build Coastguard Worker psrlw m1, m3, 4 4069*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3, m1 4070*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 4071*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 4072*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 4073*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4074*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4075*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m0 4076*c0909341SAndroid 
Build Coastguard Worker mova [dstq+16*1], m1 4077*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m3 4078*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m3 4079*c0909341SAndroid Build Coastguard Worker movu m3, [idxq+16*1] 4080*c0909341SAndroid Build Coastguard Worker add idxq, 32 4081*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4082*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4083*c0909341SAndroid Build Coastguard Worker mova [dstq+16*2], m0 4084*c0909341SAndroid Build Coastguard Worker mova [dstq+16*3], m1 4085*c0909341SAndroid Build Coastguard Worker psrlw m1, m3, 4 4086*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3, m1 4087*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 4088*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m0 4089*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m0 4090*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4091*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4092*c0909341SAndroid Build Coastguard Worker mova [dstq+16*4], m0 4093*c0909341SAndroid Build Coastguard Worker mova [dstq+16*5], m1 4094*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m3 4095*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m3 4096*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 4097*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 4098*c0909341SAndroid Build Coastguard Worker mova [dstq+16*6], m0 4099*c0909341SAndroid Build Coastguard Worker mova [dstq+16*7], m1 4100*c0909341SAndroid Build Coastguard Worker add dstq, strideq 4101*c0909341SAndroid Build Coastguard Worker dec hd 4102*c0909341SAndroid Build Coastguard Worker jg .w64 4103*c0909341SAndroid Build Coastguard Worker RET 4104