; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; SMOOTH_WEIGHT_TABLE x0, x1, ...
; For every argument x, emits the byte pair (x-128, 127-x).
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; Constant tables addressed by label from the code section. The order and the
; element counts are significant (SECTION_RODATA is 16-byte aligned and the
; labels follow each other without padding), so do not reorder casually.
ipred_v_shuf:     db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
ipred_h_shuf:     db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
ipred_paeth_shuf: db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
z_upsample1:      db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z_upsample2:      db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
z_transpose4:     db  8, 12,  0,  4,  9, 13,  1,  5, 10, 14,  2,  6, 11, 15,  3,  7
z3_shuf:          db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z3_shuf_h4:       db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
filter_shuf1:     db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
filter_shuf2:     db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
; NOTE(review): the trailing comma on the next line is kept exactly as found.
z_filter_wh4:     db  7,  7, 19,  7,
z_filter_wh8:     db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
pd_32768:         dd 32768
z3_filter_k_tail: db 64,  0, 64,  0, 64,  0, 56,  8
z1_shuf_w4:       db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
pb_0to15:         db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
pb_15to0:         db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
z_base_inc:       dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
z3_base_inc:      dw  7*64,  6*64,  5*64,  4*64,  3*64,  2*64,  1*64,  0*64
z_filter_wh16:    db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filter_t_w48:   db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
                  db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
z_filter_t_w16:   db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
z_filter_s:       db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
                  db  7,  8,  8,  9,  9, 10, 10, 11
z_filter_k_tail:  db  0, 64,  0, 64,  8, 56,  0, 64
z2_h_shuf:        db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
z2_upsample:      db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
z2_dy_offset:     dw 88*64, 88*64, 87*64, 87*64
pw_m1to4:         dw -1, -2, -3, -4
z_filter_k:       times  4 db  0, 16
                  times  4 db  0, 20
                  times  4 db  8, 16
                  times  4 db 32, 16
                  times  4 db 24, 20
                  times  4 db 16, 16
                  times  4 db  0,  0
                  times  4 db  0,  0
; Splat constants: pb_* are bytes, pw_* are 16-bit words, pd_* are dwords.
pw_8:             times  8 db  8,  0    ; eight words of 8, written as byte pairs
pb_3:             times 16 db  3
pb_16:            times 16 db 16
pw_62:            times  8 dw 62
pw_64:            times  8 dw 64
pw_256:           times  8 dw 256
pw_512:           times  8 dw 512
pw_m256:          times  8 dw -256
pb_2:             times  8 db 2
pb_4:             times  8 db 4
pb_8:             times  8 db 8
pb_128:           times  8 db 128
pb_m16:           times  8 db -16
pw_128:           times  4 dw 128
pw_255:           times  4 dw 255
pb_36_m4:         times  4 db 36, -4
pb_127_m127:      times  4 db 127, -127

; JMP_TABLE func, isa, label0, label1, ...
; Emits one dword per label holding the offset of <func>_8bpc_<isa>.<label>
; from the symbol <func>_<isa>_table. That symbol is placed 2*4 bytes before
; the first entry so the table can be indexed with tzcnt(size)*4 directly:
; the smallest size used is 4, and tzcnt(4) == 2.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; "Splat" views of the dc/cfl tables: they skip the h* and w* entries (10 for
; dc, 8 for cfl) and start at the s* store-loop entries. The s* labels are
; listed as "sN-10*4" / "sN-8*4" so that the stored offsets are relative to the
; rebased table address rather than to the original one.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)

JMP_TABLE ipred_h,          ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                   s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left,    ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3,         ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
139*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ 140*c0909341SAndroid Build Coastguard Worker s4-8*4, s8-8*4, s16-8*4, s32-8*4 141*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 142*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 143*c0909341SAndroid Build Coastguard Worker 144*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative 145*c0909341SAndroid Build Coastguard Workercextern filter_intra_taps 146*c0909341SAndroid Build Coastguard Worker 147*c0909341SAndroid Build Coastguard WorkerSECTION .text 148*c0909341SAndroid Build Coastguard Worker 149*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 150*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 151*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int a); 152*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 153*c0909341SAndroid Build Coastguard Worker%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 154*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, %3 ; extend 8 byte for 2 pos 155*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 156*c0909341SAndroid Build Coastguard Worker mova [dstq + %2], m1 157*c0909341SAndroid Build Coastguard Worker%if %1 > 16 158*c0909341SAndroid Build Coastguard Worker mova [dstq + 16 + %2], m1 159*c0909341SAndroid Build Coastguard Worker%endif 160*c0909341SAndroid Build Coastguard Worker%if %1 > 32 161*c0909341SAndroid Build Coastguard Worker mova [dstq + 32 + %2], m1 162*c0909341SAndroid Build Coastguard Worker mova [dstq + 48 + %2], m1 163*c0909341SAndroid Build Coastguard Worker%endif 164*c0909341SAndroid Build Coastguard 
Worker%endmacro 165*c0909341SAndroid Build Coastguard Worker 166*c0909341SAndroid Build Coastguard Worker%macro IPRED_H 1 ; width 167*c0909341SAndroid Build Coastguard Worker sub tlq, 4 168*c0909341SAndroid Build Coastguard Worker movd m0, [tlq] ; get 4 bytes of topleft data 169*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 ; extend 2 byte 170*c0909341SAndroid Build Coastguard Worker%if %1 == 4 171*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q2233 172*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m1 173*c0909341SAndroid Build Coastguard Worker psrlq m1, 32 174*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m1 175*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0011 176*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], m0 177*c0909341SAndroid Build Coastguard Worker psrlq m0, 32 178*c0909341SAndroid Build Coastguard Worker movd [dstq+stride3q ], m0 179*c0909341SAndroid Build Coastguard Worker 180*c0909341SAndroid Build Coastguard Worker%elif %1 == 8 181*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m0 182*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m0, m0 183*c0909341SAndroid Build Coastguard Worker punpckldq m0, m0 184*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], m1 185*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], m1 186*c0909341SAndroid Build Coastguard Worker movq [dstq+stride3q ], m0 187*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], m0 188*c0909341SAndroid Build Coastguard Worker%else 189*c0909341SAndroid Build Coastguard Worker IPRED_SET %1, 0, q3333 190*c0909341SAndroid Build Coastguard Worker IPRED_SET %1, strideq, q2222 191*c0909341SAndroid Build Coastguard Worker IPRED_SET %1, strideq*2, q1111 192*c0909341SAndroid Build Coastguard Worker IPRED_SET %1, stride3q, q0000 193*c0909341SAndroid Build Coastguard Worker%endif 194*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 
195*c0909341SAndroid Build Coastguard Worker sub hd, 4 196*c0909341SAndroid Build Coastguard Worker jg .w%1 197*c0909341SAndroid Build Coastguard Worker RET 198*c0909341SAndroid Build Coastguard Worker%endmacro 199*c0909341SAndroid Build Coastguard Worker 200*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 201*c0909341SAndroid Build Coastguard Workercglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 202*c0909341SAndroid Build Coastguard Worker LEA r5, ipred_h_ssse3_table 203*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 204*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 205*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 206*c0909341SAndroid Build Coastguard Worker add wq, r5 207*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 208*c0909341SAndroid Build Coastguard Worker jmp wq 209*c0909341SAndroid Build Coastguard Worker.w4: 210*c0909341SAndroid Build Coastguard Worker IPRED_H 4 211*c0909341SAndroid Build Coastguard Worker.w8: 212*c0909341SAndroid Build Coastguard Worker IPRED_H 8 213*c0909341SAndroid Build Coastguard Worker.w16: 214*c0909341SAndroid Build Coastguard Worker IPRED_H 16 215*c0909341SAndroid Build Coastguard Worker.w32: 216*c0909341SAndroid Build Coastguard Worker IPRED_H 32 217*c0909341SAndroid Build Coastguard Worker.w64: 218*c0909341SAndroid Build Coastguard Worker IPRED_H 64 219*c0909341SAndroid Build Coastguard Worker 220*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 221*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 222*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int a); 223*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 224*c0909341SAndroid Build Coastguard Workercglobal ipred_v_8bpc, 3, 7, 6, 
dst, stride, tl, w, h, stride3 225*c0909341SAndroid Build Coastguard Worker LEA r5, ipred_dc_splat_ssse3_table 226*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 227*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+ 1] 228*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+17] 229*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+33] 230*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+49] 231*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 232*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 233*c0909341SAndroid Build Coastguard Worker add wq, r5 234*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 235*c0909341SAndroid Build Coastguard Worker jmp wq 236*c0909341SAndroid Build Coastguard Worker 237*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 238*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 239*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int a); 240*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 241*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 242*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 243*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 244*c0909341SAndroid Build Coastguard Worker tzcnt r6d, hd 245*c0909341SAndroid Build Coastguard Worker lea r5d, [wq+hq] 246*c0909341SAndroid Build Coastguard Worker movd m4, r5d 247*c0909341SAndroid Build Coastguard Worker tzcnt r5d, r5d 248*c0909341SAndroid Build Coastguard Worker movd m5, r5d 249*c0909341SAndroid Build Coastguard Worker LEA r5, ipred_dc_ssse3_table 250*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 251*c0909341SAndroid Build Coastguard Worker movsxd r6, [r5+r6*4] 
252*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4+20] 253*c0909341SAndroid Build Coastguard Worker pcmpeqd m3, m3 254*c0909341SAndroid Build Coastguard Worker psrlw m4, 1 ; dc = (width + height) >> 1; 255*c0909341SAndroid Build Coastguard Worker add r6, r5 256*c0909341SAndroid Build Coastguard Worker add wq, r5 257*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 258*c0909341SAndroid Build Coastguard Worker jmp r6 259*c0909341SAndroid Build Coastguard Worker.h4: 260*c0909341SAndroid Build Coastguard Worker movd m0, [tlq-4] 261*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 262*c0909341SAndroid Build Coastguard Worker jmp wq 263*c0909341SAndroid Build Coastguard Worker.w4: 264*c0909341SAndroid Build Coastguard Worker movd m1, [tlq+1] 265*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 266*c0909341SAndroid Build Coastguard Worker psubw m0, m4 267*c0909341SAndroid Build Coastguard Worker paddw m0, m1 268*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 269*c0909341SAndroid Build Coastguard Worker cmp hd, 4 270*c0909341SAndroid Build Coastguard Worker jg .w4_mul 271*c0909341SAndroid Build Coastguard Worker psrlw m0, 3 ; dc >>= ctz(width + height); 272*c0909341SAndroid Build Coastguard Worker jmp .w4_end 273*c0909341SAndroid Build Coastguard Worker.w4_mul: 274*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m0, m0 275*c0909341SAndroid Build Coastguard Worker paddw m0, m1 276*c0909341SAndroid Build Coastguard Worker psrlq m1, m0, 32 277*c0909341SAndroid Build Coastguard Worker paddw m0, m1 278*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 279*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 280*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 281*c0909341SAndroid Build Coastguard Worker test hd, 8 282*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 283*c0909341SAndroid Build Coastguard Worker movd m5, r6d 284*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m5 
285*c0909341SAndroid Build Coastguard Worker.w4_end: 286*c0909341SAndroid Build Coastguard Worker pxor m1, m1 287*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 288*c0909341SAndroid Build Coastguard Worker.s4: 289*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m0 290*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m0 291*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], m0 292*c0909341SAndroid Build Coastguard Worker movd [dstq+stride3q ], m0 293*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 294*c0909341SAndroid Build Coastguard Worker sub hd, 4 295*c0909341SAndroid Build Coastguard Worker jg .s4 296*c0909341SAndroid Build Coastguard Worker RET 297*c0909341SAndroid Build Coastguard WorkerALIGN function_align 298*c0909341SAndroid Build Coastguard Worker.h8: 299*c0909341SAndroid Build Coastguard Worker movq m0, [tlq-8] 300*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 301*c0909341SAndroid Build Coastguard Worker jmp wq 302*c0909341SAndroid Build Coastguard Worker.w8: 303*c0909341SAndroid Build Coastguard Worker movq m1, [tlq+1] 304*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 305*c0909341SAndroid Build Coastguard Worker psubw m4, m0 306*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 307*c0909341SAndroid Build Coastguard Worker psubw m0, m4 308*c0909341SAndroid Build Coastguard Worker paddw m0, m1 309*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 310*c0909341SAndroid Build Coastguard Worker paddw m0, m1 311*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 312*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 313*c0909341SAndroid Build Coastguard Worker cmp hd, 8 314*c0909341SAndroid Build Coastguard Worker je .w8_end 315*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 316*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 317*c0909341SAndroid Build Coastguard Worker cmp hd, 32 
318*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 319*c0909341SAndroid Build Coastguard Worker movd m1, r6d 320*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 321*c0909341SAndroid Build Coastguard Worker.w8_end: 322*c0909341SAndroid Build Coastguard Worker pxor m1, m1 323*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 324*c0909341SAndroid Build Coastguard Worker.s8: 325*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 326*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], m0 327*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], m0 328*c0909341SAndroid Build Coastguard Worker movq [dstq+stride3q ], m0 329*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 330*c0909341SAndroid Build Coastguard Worker sub hd, 4 331*c0909341SAndroid Build Coastguard Worker jg .s8 332*c0909341SAndroid Build Coastguard Worker RET 333*c0909341SAndroid Build Coastguard WorkerALIGN function_align 334*c0909341SAndroid Build Coastguard Worker.h16: 335*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16] 336*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 337*c0909341SAndroid Build Coastguard Worker jmp wq 338*c0909341SAndroid Build Coastguard Worker.w16: 339*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+1] 340*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 341*c0909341SAndroid Build Coastguard Worker paddw m0, m1 342*c0909341SAndroid Build Coastguard Worker psubw m4, m0 343*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 344*c0909341SAndroid Build Coastguard Worker psubw m0, m4 345*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 346*c0909341SAndroid Build Coastguard Worker paddw m0, m1 347*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 348*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 349*c0909341SAndroid Build Coastguard Worker cmp hd, 16 350*c0909341SAndroid Build Coastguard Worker je .w16_end 
351*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 352*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 353*c0909341SAndroid Build Coastguard Worker test hd, 8|32 354*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 355*c0909341SAndroid Build Coastguard Worker movd m1, r6d 356*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 357*c0909341SAndroid Build Coastguard Worker.w16_end: 358*c0909341SAndroid Build Coastguard Worker pxor m1, m1 359*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 360*c0909341SAndroid Build Coastguard Worker.s16: 361*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 362*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m0 363*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m0 364*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q ], m0 365*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 366*c0909341SAndroid Build Coastguard Worker sub hd, 4 367*c0909341SAndroid Build Coastguard Worker jg .s16 368*c0909341SAndroid Build Coastguard Worker RET 369*c0909341SAndroid Build Coastguard WorkerALIGN function_align 370*c0909341SAndroid Build Coastguard Worker.h32: 371*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-32] 372*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 373*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16] 374*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 375*c0909341SAndroid Build Coastguard Worker paddw m0, m2 376*c0909341SAndroid Build Coastguard Worker jmp wq 377*c0909341SAndroid Build Coastguard Worker.w32: 378*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+1] 379*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 380*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+17] 381*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 382*c0909341SAndroid Build Coastguard Worker paddw m1, m2 383*c0909341SAndroid Build Coastguard Worker paddw m0, m1 
384*c0909341SAndroid Build Coastguard Worker psubw m4, m0 385*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 386*c0909341SAndroid Build Coastguard Worker psubw m0, m4 387*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 388*c0909341SAndroid Build Coastguard Worker paddw m0, m1 389*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 390*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 391*c0909341SAndroid Build Coastguard Worker cmp hd, 32 392*c0909341SAndroid Build Coastguard Worker je .w32_end 393*c0909341SAndroid Build Coastguard Worker lea r2d, [hq*2] 394*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 395*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 396*c0909341SAndroid Build Coastguard Worker test hd, 64|16 397*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 398*c0909341SAndroid Build Coastguard Worker movd m1, r6d 399*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 400*c0909341SAndroid Build Coastguard Worker.w32_end: 401*c0909341SAndroid Build Coastguard Worker pxor m1, m1 402*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 403*c0909341SAndroid Build Coastguard Worker mova m1, m0 404*c0909341SAndroid Build Coastguard Worker.s32: 405*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 406*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 407*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq], m0 408*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+16], m1 409*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m0 410*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2+16], m1 411*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q], m0 412*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q+16], m1 413*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 414*c0909341SAndroid Build Coastguard Worker sub hd, 4 415*c0909341SAndroid Build Coastguard Worker jg .s32 
416*c0909341SAndroid Build Coastguard Worker RET 417*c0909341SAndroid Build Coastguard WorkerALIGN function_align 418*c0909341SAndroid Build Coastguard Worker.h64: 419*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-64] 420*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-48] 421*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 422*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 423*c0909341SAndroid Build Coastguard Worker paddw m0, m1 424*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-32] 425*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 426*c0909341SAndroid Build Coastguard Worker paddw m0, m1 427*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-16] 428*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 429*c0909341SAndroid Build Coastguard Worker paddw m0, m1 430*c0909341SAndroid Build Coastguard Worker jmp wq 431*c0909341SAndroid Build Coastguard Worker.w64: 432*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+ 1] 433*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+17] 434*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 435*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 436*c0909341SAndroid Build Coastguard Worker paddw m1, m2 437*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+33] 438*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 439*c0909341SAndroid Build Coastguard Worker paddw m1, m2 440*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+49] 441*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 442*c0909341SAndroid Build Coastguard Worker paddw m1, m2 443*c0909341SAndroid Build Coastguard Worker paddw m0, m1 444*c0909341SAndroid Build Coastguard Worker psubw m4, m0 445*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 446*c0909341SAndroid Build Coastguard Worker psubw m0, m4 447*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 448*c0909341SAndroid Build Coastguard Worker paddw m0, m1 
449*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 450*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 451*c0909341SAndroid Build Coastguard Worker cmp hd, 64 452*c0909341SAndroid Build Coastguard Worker je .w64_end 453*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 454*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 455*c0909341SAndroid Build Coastguard Worker test hd, 32 456*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 457*c0909341SAndroid Build Coastguard Worker movd m1, r6d 458*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 459*c0909341SAndroid Build Coastguard Worker.w64_end: 460*c0909341SAndroid Build Coastguard Worker pxor m1, m1 461*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 462*c0909341SAndroid Build Coastguard Worker mova m1, m0 463*c0909341SAndroid Build Coastguard Worker mova m2, m0 464*c0909341SAndroid Build Coastguard Worker mova m3, m0 465*c0909341SAndroid Build Coastguard Worker.s64: 466*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 467*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 468*c0909341SAndroid Build Coastguard Worker mova [dstq+32], m2 469*c0909341SAndroid Build Coastguard Worker mova [dstq+48], m3 470*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq], m0 471*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+16], m1 472*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+32], m2 473*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq+48], m3 474*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 475*c0909341SAndroid Build Coastguard Worker sub hd, 2 476*c0909341SAndroid Build Coastguard Worker jg .s64 477*c0909341SAndroid Build Coastguard Worker RET 478*c0909341SAndroid Build Coastguard Worker 479*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 480*c0909341SAndroid Build Coastguard Worker;int 
; dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the h bytes stored directly below tl (tl-h .. tl-1).
;
; The bytes are summed, the sum is scaled by (pd_32768 >> tzcnt(h)) with
; pmulhrsw (a rounding divide by h when h is a power of two), and the
; resulting byte is broadcast into m0-m3. Control then leaves through wq, an
; entry of ipred_dc_splat_ssse3_table selected by tzcnt(w).
;
; Both tables hold 32-bit offsets relative to their own base (see the
; movsxd/add pairs). The .hN labels are presumably the targets stored in
; ipred_dc_left_ssse3_table - the table itself is defined elsewhere.
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA         r5, ipred_dc_left_ssse3_table
    mov         hd, hm                      ; zero upper half
    tzcnt       r6d, hd                     ; r6 = trailing-zero count of h
    sub         tlq, hq                     ; tl -= h
    tzcnt       wd, wm                      ; w  = trailing-zero count of width
    movu        m0, [tlq]
    movd        m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd        m2, r6d
    psrld       m3, m2                      ; m3 = [pd_32768] >> tzcnt(h)
    movsxd      r6, [r5+r6*4]
    pcmpeqd     m2, m2                      ; m2 = all bits set (-1 in every lane)
    pmaddubsw   m0, m2                      ; words = -(byte[2i] + byte[2i+1])
    add         r6, r5                      ; r6 = summing entry point for this h
    add         r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd      wq, [r5+wq*4]
    add         wq, r5                      ; wq = store routine for this width
    jmp         r6
.h64:
    movu        m1, [tlq+48]                ; unaligned when jumping here from dc_top
    pmaddubsw   m1, m2
    paddw       m0, m1
    movu        m1, [tlq+32]                ; unaligned when jumping here from dc_top
    pmaddubsw   m1, m2
    paddw       m0, m1
.h32:
    movu        m1, [tlq+16]                ; unaligned when jumping here from dc_top
    pmaddubsw   m1, m2
    paddw       m0, m1
.h16:
    pshufd      m1, m0, q3232               ; m1 low qword = m0 high qword
    paddw       m0, m1                      ; fold 8 partial sums into 4
.h8:
    pshuflw     m1, m0, q1032               ; swap the two dwords of the low qword
    paddw       m0, m1                      ; fold 4 partial sums into 2
.h4:
    pmaddwd     m0, m2                      ; (-1)*w0 + (-1)*w1: final sum, sign restored
    pmulhrsw    m0, m3                      ; rounded (sum * m3) >> 15
    lea         stride3q, [strideq*3]
    pxor        m1, m1
    pshufb      m0, m1                      ; broadcast byte 0 to all 16 lanes
    mova        m1, m0
    mova        m2, m0
    mova        m3, m0
    jmp         wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Fills the block with the constant loaded from pb_128: the value is copied
; into m0-m3 and the store routine for tzcnt(w) is taken from
; ipred_dc_splat_ssse3_table. Only dst and stride are loaded by cglobal; w and
; h are fetched from wm / hm.
; NOTE(review): the prototype above predates the _8bpc name used in the cglobal
; line - confirm the exported symbol before relying on it.
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    LEA         r5, ipred_dc_splat_ssse3_table
    tzcnt       wd, wm
    movifnidn   hd, hm
    movsxd      wq, [r5+wq*4]
    movddup     m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
    mova        m1, m0
    mova        m2, m0
    mova        m3, m0
    add         wq, r5
    lea         stride3q, [strideq*3]
    jmp         wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the w bytes starting at tl+1.
; Same setup as ipred_dc_left_8bpc, except that the scale and the summing entry
; point are both chosen by tzcnt(w); execution continues at the address read
; from ipred_dc_left_ssse3_table and never returns here.
; NOTE(review): the prototype above predates the _8bpc name used in the cglobal
; line - confirm the exported symbol before relying on it.
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
    LEA         r5, ipred_dc_left_ssse3_table
    tzcnt       wd, wm
    inc         tlq                         ; first summed byte is tl[1]
    movu        m0, [tlq]
    movifnidn   hd, hm
    movd        m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd        m2, wd
    psrld       m3, m2                      ; m3 = [pd_32768] >> tzcnt(w)
    movsxd      r6, [r5+wq*4]
    pcmpeqd     m2, m2                      ; m2 = all bits set
    pmaddubsw   m0, m2                      ; words = -(byte[2i] + byte[2i+1])
    add         r6, r5
    add         r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd      wq, [r5+wq*4]
    add         wq, r5
    jmp         r6

;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH src1, src2, mul1, mul2, add1, add2
; Blends two registers of interleaved (a, b) byte pairs with the biased weight
; pairs (w - 128, 127 - w) stored in smooth_weights:
;   w * a         = (w - 128) * a + 128 * a
;   (256 - w) * b = (127 - w) * b + 129 * b
;   => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
; The second bracket (plus the rounding term) is passed in through add1/add2.
; Result: 16 packed bytes in m6. m0 is clobbered.
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddubsw   m6, m%3, m%1
    pmaddubsw   m0, m%4, m%2                ; (w - 128) * a + (127 - w) * b
    paddw       m6, m%5
    paddw       m0, m%6                     ; ... + [128 * a + 129 * b + 128]
    psrlw       m6, 8
    psrlw       m0, 8
    packuswb    m6, m0
%endmacro

; SMOOTH_V_PREP bottom
; Shared by the 16-column paths of ipred_smooth_v_8bpc. Expects
; m0 = [pb_127_m127] and m1 = [pw_128]; both are overwritten.
; Out: m2/m3 = (top, bottom) byte pairs for the low/high 8 columns at tl+1,
;      m4/m5 = 128 * top + 129 * bottom + 128 for the same columns.
%macro SMOOTH_V_PREP 1 ; bottom
    movu        m3, [tlq+1]
    punpcklbw   m2, m3, m%1                 ; top, bottom
    punpckhbw   m3, m%1
    pmaddubsw   m4, m2, m0                  ; 127 * top - 127 * bottom
    pmaddubsw   m5, m3, m0
    paddw       m0, m1, m2                  ; 1 * top + 256 * bottom + 128, overflow is ok
    paddw       m1, m3
    paddw       m4, m0                      ; 128 * top + 129 * bottom + 128
    paddw       m5, m1
%endmacro

; SMOOTH_V_ROW16
; Broadcasts the weight pair at weightsq+hq*2 to every word of m1, blends the
; 16 columns prepared by SMOOTH_V_PREP and stores them at dstq.
%macro SMOOTH_V_ROW16 0
    movd        m1, [weightsq+hq*2]
    pshuflw     m1, m1, q0000
    punpcklqdq  m1, m1
    SMOOTH      1, 1, 2, 3, 4, 5
    mova        [dstq], m6
%endmacro

; Vertical smooth prediction: every output row blends the bytes at tl+1..tl+w
; ("top") with the single byte tl[-h] ("bottom"), using one weight pair per row.
; hq runs from -h up to 0 and weightsq is biased by h*4, so weightsq+hq*2 walks
; the 2-byte table entries h .. 2h-1 of smooth_weights.
; NOTE(review): the prototype above predates the _8bpc name used in the cglobal
; line - confirm the exported symbol before relying on it.
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
    LEA         r6, ipred_smooth_v_ssse3_table
    tzcnt       wd, wm
    mov         hd, hm
    movsxd      wq, [r6+wq*4]
    movddup     m0, [base+pb_127_m127]
    movddup     m1, [base+pw_128]
    lea         weightsq, [base+smooth_weights+hq*4]
    neg         hq
    movd        m5, [tlq+hq]
    pxor        m2, m2
    pshufb      m5, m2                      ; m5 = tl[-h] in every byte (bottom)
    add         wq, r6
    jmp         wq
.w4:
    movd        m2, [tlq+1]
    punpckldq   m2, m2
    punpcklbw   m2, m5                      ; top, bottom
    lea         r3, [strideq*3]
    mova        m4, [base+ipred_v_shuf]
    mova        m5, m4
    punpckldq   m4, m4
    punpckhdq   m5, m5
    pmaddubsw   m3, m2, m0                  ; m3: 127 * top - 127 * bottom
    paddw       m1, m2                      ; m1:   1 * top + 256 * bottom + 128, overflow is ok
    paddw       m3, m1                      ; m3: 128 * top + 129 * bottom + 128
.w4_loop:                                   ; 4 rows per iteration
    movu        m1, [weightsq+hq*2]
    pshufb      m0, m1, m4                  ; m2, m3, m4 and m5 must stay untouched in this loop
    pshufb      m1, m5
    SMOOTH      0, 1, 2, 2, 3, 3
    movd        [dstq+strideq*0], m6
    pshuflw     m1, m6, q1032
    movd        [dstq+strideq*1], m1
    punpckhqdq  m6, m6
    movd        [dstq+strideq*2], m6
    psrlq       m6, 32
    movd        [dstq+r3       ], m6
    lea         dstq, [dstq+strideq*4]
    add         hq, 4
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    movq        m2, [tlq+1]
    punpcklbw   m2, m5                      ; top, bottom
    mova        m5, [base+ipred_v_shuf]
    lea         r3, [strideq*3]
    pshufd      m4, m5, q0000
    pshufd      m5, m5, q1111
    pmaddubsw   m3, m2, m0
    paddw       m1, m2
    paddw       m3, m1                      ; m3 is the loop-invariant bias
.w8_loop:                                   ; 2 rows per iteration
    movq        m1, [weightsq+hq*2]
    pshufb      m0, m1, m4
    pshufb      m1, m5
    SMOOTH      0, 1, 2, 2, 3, 3
    movq        [dstq+strideq*0], m6
    movhps      [dstq+strideq*1], m6
    lea         dstq, [dstq+strideq*2]
    add         hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    SMOOTH_V_PREP 5                         ; m4 and m5 are the loop-invariant biases
.w16_loop:                                  ; 1 row per iteration
    SMOOTH_V_ROW16
    add         dstq, strideq
    add         hq, 1
    jl .w16_loop
    RET
ALIGN function_align
.w32:
    WIN64_PUSH_XMM 8, 7
    mova        m7, m5                      ; keep bottom in m7; m5 is reused per strip
.w32_loop_init:
    mov         r3d, 2                      ; 2 strips of 16 columns per row
.w32_loop:
    movddup     m0, [base+pb_127_m127]      ; reloaded: the previous strip clobbered m0/m1
    movddup     m1, [base+pw_128]
    SMOOTH_V_PREP 7
    SMOOTH_V_ROW16
    add         tlq, 16
    add         dstq, 16
    dec         r3d
    jg .w32_loop
    lea         dstq, [dstq-32+strideq]     ; back to column 0 of the next row
    sub         tlq, 32
    add         hq, 1
    jl .w32_loop_init
    RET
ALIGN function_align
.w64:
    WIN64_PUSH_XMM 8, 7
    mova        m7, m5                      ; keep bottom in m7; m5 is reused per strip
.w64_loop_init:
    mov         r3d, 4                      ; 4 strips of 16 columns per row
.w64_loop:
    movddup     m0, [base+pb_127_m127]      ; reloaded: the previous strip clobbered m0/m1
    movddup     m1, [base+pw_128]
    SMOOTH_V_PREP 7
    SMOOTH_V_ROW16
    add         tlq, 16
    add         dstq, 16
    dec         r3d
    jg .w64_loop
    lea         dstq, [dstq-64+strideq]     ; back to column 0 of the next row
    sub         tlq, 64
    add         hq, 1
    jl .w64_loop_init
    RET

;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH_H_BLEND weights_lo, weights_hi[, weights_ptr]
; Inner blend of ipred_smooth_h_8bpc.
; In:  m2 = left pixels already shuffled into their output columns,
;      m3 = right pixel in every byte, m4 = [pb_127_m127], m5 = [pw_128],
;      m<weights_lo>/m<weights_hi> = weight pairs for the low/high 8 bytes.
; If weights_ptr is given, m<weights_hi> is first reloaded from [weights_ptr]
; and the pointer is advanced by 16 (used by the 32/64-wide paths, which
; stream the weight table through a single register).
; Out: m0 = 16 blended bytes. m1 and m2 are clobbered.
%macro SMOOTH_H_BLEND 2-3 ; weights_lo, weights_hi[, weights_ptr]
    punpcklbw   m1, m2, m3                  ; left, right
    punpckhbw   m2, m3
    pmaddubsw   m0, m1, m4                  ; 127 * left - 127 * right
    paddw       m0, m1                      ; 128 * left + 129 * right
    pmaddubsw   m1, m%1
    paddw       m1, m5
    paddw       m0, m1
    pmaddubsw   m1, m2, m4
    paddw       m1, m2
%if %0 == 3
    mova        m%2, [%3]
    add         %3, 16
%endif
    pmaddubsw   m2, m%2
    paddw       m2, m5
    paddw       m1, m2
    psrlw       m0, 8
    psrlw       m1, 8
    packuswb    m0, m1
%endmacro

; Horizontal smooth prediction: every output row blends one left-edge byte
; (read from below tl, starting at tl[-1] for the first row) with the single
; byte tl[w] ("right"), using one weight pair per column. The column weights
; start at smooth_weights + w*2 bytes (4*2, 8*2, 16*2, 16*4, 16*8 below).
; hd counts the remaining rows down to zero and doubles as the index into the
; left edge.
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
    LEA         r6, ipred_smooth_h_ssse3_table
    mov         wd, wm
    movd        m3, [tlq+wq]
    pxor        m1, m1
    pshufb      m3, m1                      ; right
    tzcnt       wd, wd
    mov         hd, hm
    movsxd      wq, [r6+wq*4]
    movddup     m4, [base+pb_127_m127]
    movddup     m5, [base+pw_128]
    add         wq, r6
    jmp         wq
.w4:
    movddup     m6, [base+smooth_weights+4*2]
    mova        m7, [base+ipred_h_shuf]
    sub         tlq, 4
    sub         tlq, hq                     ; tlq+hq -> the 4 left bytes of the current rows
    lea         r3, [strideq*3]
.w4_loop:                                   ; 4 rows per iteration
    movd        m2, [tlq+hq]                ; left
    pshufb      m2, m7
    SMOOTH_H_BLEND 6, 6
    movd        [dstq+strideq*0], m0
    pshuflw     m1, m0, q1032
    movd        [dstq+strideq*1], m1
    punpckhqdq  m0, m0
    movd        [dstq+strideq*2], m0
    psrlq       m0, 32
    movd        [dstq+r3       ], m0
    lea         dstq, [dstq+strideq*4]
    sub         hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova        m6, [base+smooth_weights+8*2]
    mova        m7, [base+ipred_h_shuf]
    sub         tlq, 4
    sub         tlq, hq
    punpckldq   m7, m7
.w8_loop:                                   ; 2 rows per iteration
    movd        m2, [tlq+hq]                ; left
    pshufb      m2, m7
    SMOOTH_H_BLEND 6, 6
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
    sub         hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    mova        m6, [base+smooth_weights+16*2]
    mova        m7, [base+smooth_weights+16*3]
    sub         tlq, 1
    sub         tlq, hq                     ; tlq+hq -> left byte of the current row
.w16_loop:                                  ; 1 row per iteration
    pxor        m1, m1
    movd        m2, [tlq+hq]                ; left
    pshufb      m2, m1                      ; broadcast byte 0
    SMOOTH_H_BLEND 6, 7
    mova        [dstq], m0
    lea         dstq, [dstq+strideq]
    sub         hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    sub         tlq, 1
    sub         tlq, hq
    pxor        m6, m6                      ; zero shuffle mask: broadcast byte 0
.w32_loop_init:
    mov         r5, 2                       ; 2 strips of 16 columns per row
    lea         r3, [base+smooth_weights+16*4]
.w32_loop:
    mova        m7, [r3]
    add         r3, 16
    movd        m2, [tlq+hq]                ; left
    pshufb      m2, m6
    SMOOTH_H_BLEND 7, 7, r3
    mova        [dstq], m0
    add         dstq, 16
    dec         r5
    jg .w32_loop
    lea         dstq, [dstq-32+strideq]     ; back to column 0 of the next row
    sub         hd, 1
    jg .w32_loop_init
    RET
ALIGN function_align
.w64:
    sub         tlq, 1
    sub         tlq, hq
    pxor        m6, m6                      ; zero shuffle mask: broadcast byte 0
.w64_loop_init:
    mov         r5, 4                       ; 4 strips of 16 columns per row
    lea         r3, [base+smooth_weights+16*8]
.w64_loop:
    mova        m7, [r3]
    add         r3, 16
    movd        m2, [tlq+hq]                ; left
    pshufb      m2, m6
    SMOOTH_H_BLEND 7, 7, r3
    mova        [dstq], m0
    add         dstq, 16
    dec         r5
    jg .w64_loop
    lea         dstq, [dstq-64+strideq]     ; back to column 0 of the next row
    sub         hd, 1
    jg .w64_loop_init
    RET

Worker;--------------------------------------------------------------------------------------- 928*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 929*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int a); 930*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 931*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 932*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m%3, m%1 933*c0909341SAndroid Build Coastguard Worker mova m0, m6 934*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m%4, m%2 935*c0909341SAndroid Build Coastguard Worker mova m1, m6 936*c0909341SAndroid Build Coastguard Worker%ifnum %5 937*c0909341SAndroid Build Coastguard Worker paddw m0, m%5 938*c0909341SAndroid Build Coastguard Worker%else 939*c0909341SAndroid Build Coastguard Worker paddw m0, %5 940*c0909341SAndroid Build Coastguard Worker%endif 941*c0909341SAndroid Build Coastguard Worker%ifnum %6 942*c0909341SAndroid Build Coastguard Worker paddw m1, m%6 943*c0909341SAndroid Build Coastguard Worker%else 944*c0909341SAndroid Build Coastguard Worker paddw m1, %6 945*c0909341SAndroid Build Coastguard Worker%endif 946*c0909341SAndroid Build Coastguard Worker%ifnum %7 947*c0909341SAndroid Build Coastguard Worker%else 948*c0909341SAndroid Build Coastguard Worker mova m3, %7 949*c0909341SAndroid Build Coastguard Worker%endif 950*c0909341SAndroid Build Coastguard Worker pavgw m0, m2 951*c0909341SAndroid Build Coastguard Worker pavgw m1, m3 952*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 953*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 954*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 955*c0909341SAndroid Build Coastguard Worker%endmacro 956*c0909341SAndroid Build Coastguard Worker 957*c0909341SAndroid Build Coastguard 
; SMOOTH_OUTPUT_16B: emit one 16-pixel-wide segment of a single output row of
; the 2-D smooth predictor. Used by the .w32 and .w64 paths of
; ipred_smooth_8bpc below, once per 16 columns.
;
; Arguments (slot numbers are 16-byte units relative to rsp):
;   %1        slot holding the 16 top pixels of this segment (loaded into m1)
;   %2, %3    scratch: low-half interleaved (top, bottom) and its biased blend
;   %4, %5    scratch: high-half interleaved (top, bottom) and its biased blend
;   %6, %7    memory operands with the horizontal weights for columns 0-7 / 8-15
;   %8        scratch slot for the high-half horizontal sum (spilled m3)
;   %9        slot holding the pshufb mask applied to the vertical weights (m7)
;   %10-%12   slots from which m0 (bottom), m4 (right) and m5 (pb_127_m127)
;             are reloaded before the macro ends
;
; Expects on entry: m0 = bottom, m3 = pw_255, m4 = right, m5 = pb_127_m127,
; tlq+hq addressing the dword whose byte 3 is the current left pixel, and
; v_weightsq pointing at the current row's vertical weight pair.
; Clobbers m1, m2, m6, m7; m0, m3, m4, m5 are restored on exit.
; SMOOTH_2D_END is defined outside this section; it is relied on here to
; leave the 16 finished pixels in m0.
%macro SMOOTH_OUTPUT_16B 12      ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
    mova                 m1, [rsp+16*%1]                  ; top
    punpckhbw            m6, m1, m0                       ; top, bottom
    punpcklbw            m1, m0                           ; top, bottom
    pmaddubsw            m2, m1, m5                       ; 127 * top - 127 * bottom
    mova        [rsp+16*%2], m1
    paddw                m1, m3                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    paddw                m2, m1                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%3], m2
    pmaddubsw            m2, m6, m5
    mova        [rsp+16*%4], m6
    paddw                m6, m3                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%5], m2
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
    punpcklbw            m1, m4                           ; left, right
    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
    paddw                m2, m1                           ; 128 * left + 129 * right
    mova                 m3, m2
    pmaddubsw            m0, m1, %6                       ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, %7
    paddw                m2, m3, m0                       ; columns 0-7:  w * left + (256 - w) * right
    paddw                m3, m1                           ; columns 8-15: w * left + (256 - w) * right
    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    mova                 m7, [rsp+16*%9]
    pshufb               m1, m7                           ; spread this row's weight pair
    mova        [rsp+16*%8], m3
    mova                 m4, [rsp+16*%2]
    mova                 m5, [rsp+16*%3]
    mova                 m3, [rsp+16*%4]
    mova                 m7, [rsp+16*%5]
    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*%8]
    mova             [dstq], m0
    movddup              m3, [base+pw_255]                ; recovery
    mova                 m0, [rsp+16*%10]                 ; recovery
    mova                 m4, [rsp+16*%11]                 ; recovery
    mova                 m5, [rsp+16*%12]                 ; recovery
%endmacro

; ipred_smooth_8bpc(dst, stride, tl, w, h): 2-D smooth intra prediction.
; Each output pixel blends top/bottom with a per-row weight and left/right
; with a per-column weight, both taken from smooth_weights.
;
;   dst, stride   destination pointer and row pitch
;   tl            pointer to the top-left edge pixel: top row at tl[1 + x],
;                 left column at tl[-(1 + y)], right = tl[w], bottom = tl[-h]
;   w, h          block width/height, read from wm/hm; w selects the code
;                 path through ipred_smooth_ssse3_table[tzcnt(w)]
;   v_weights     register reserved by cglobal; set to &smooth_weights[h * 2]
;
; smooth_weights stores each weight w as the signed byte pair
; (w - 128, 127 - w), so
;   pmaddubsw(a, b) + (128 * a + 129 * b) = w * a + (256 - w) * b
; Common register state after the prologue:
;   m0 = bottom (broadcast), m3 = pw_255, m4 = right (broadcast),
;   m5 = pb_127_m127.
; The 13 16-byte stack slots hold loop invariants that do not fit in the
; 8 xmm registers.
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
    mov                  wd, wm
    mov                  hd, hm
    LEA                  r6, ipred_smooth_ssse3_table
    movd                 m4, [tlq+wq]                     ; right
    pxor                 m2, m2
    pshufb               m4, m2                           ; broadcast byte 0
    tzcnt                wd, wd
    mov                  r5, tlq
    sub                  r5, hq
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pb_127_m127]
    movd                 m0, [r5]
    pshufb               m0, m2                           ; bottom
    movddup              m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]   ; weights_ver = &dav1d_sm_weights[height]
    jmp                  wq
; 4 pixels wide: four rows per iteration.
.w4:
    mova                 m7, [base+ipred_v_shuf]
    movd                 m1, [tlq+1]                      ; top
    pshufd               m1, m1, q0000
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq                           ; [tlq+hq] now addresses the next 4 left pixels
    punpcklbw            m1, m0                           ; top, bottom
    pshufd               m6, m7, q1100
    pshufd               m7, m7, q3322
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    movq                 m1, [base+smooth_weights+4*2]    ; weights_hor = &dav1d_sm_weights[width];
    punpcklqdq           m1, m1
    mova         [rsp+16*2], m1
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m6
    mova         [rsp+16*5], m5
.w4_loop:
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+ipred_h_shuf]
    punpcklbw            m0, m1, m4                       ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5                       ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0                           ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4, [rsp+16*2]                   ; horizontal weights
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movq                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 8                            ; 4 rows * 2 bytes per weight pair
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]                   ; restore right
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]                   ; restore pb_127_m127
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
; 8 pixels wide: two rows per iteration.
.w8:
    mova                 m7, [base+ipred_v_shuf]
    movq                 m1, [tlq+1]                      ; top
    punpcklqdq           m1, m1
    sub                 tlq, 4
    sub                 tlq, hq
    punpcklbw            m1, m0                           ; top, bottom
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova                 m1, [base+smooth_weights+8*2]    ; weights_hor = &dav1d_sm_weights[width];
    mova         [rsp+16*2], m1
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m6
    mova         [rsp+16*5], m5
.w8_loop:
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+ipred_h_shuf]
    pshufd               m1, m1, q1100
    punpcklbw            m0, m1, m4                       ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5                       ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0                           ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4, [rsp+16*2]                   ; horizontal weights
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 4                            ; 2 rows * 2 bytes per weight pair
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]                   ; restore right
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]                   ; restore pb_127_m127
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
; 16 pixels wide: one row per iteration.
.w16:
    mova                 m7, [base+ipred_v_shuf]
    movu                 m1, [tlq+1]                      ; top
    sub                 tlq, 4
    sub                 tlq, hq
    punpckhbw            m6, m1, m0                       ; top, bottom
    punpcklbw            m1, m0                           ; top, bottom
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    pmaddubsw            m2, m6, m5
    mova         [rsp+16*5], m6
    paddw                m6, m3                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*6], m2
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                           ;   1 * top + 256 * bottom + 255 (mod 2^16)
    mova         [rsp+16*0], m1
    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*1], m2
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m5
.w16_loop:
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
    punpcklbw            m1, m4                           ; left, right
    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
    paddw                m2, m1                           ; 128 * left + 129 * right
    mova                 m0, m1
    mova                 m3, m2
    pmaddubsw            m0, [base+smooth_weights+16*2]   ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, [base+smooth_weights+16*3]
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 2                            ; 1 row * 2 bytes per weight pair
    mova                 m7, [rsp+16*2]
    pshufb               m1, m7
    mova         [rsp+16*7], m3
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    mova                 m3, [rsp+16*5]
    mova                 m7, [rsp+16*6]
    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*7]
    mova                 m4, [rsp+16*3]                   ; restore right
    mova                 m5, [rsp+16*4]                   ; restore pb_127_m127
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
; 32 pixels wide: one row per iteration, as two 16-pixel segments.
; Slots: 0-1 = top, 2 = vertical-weight shuffle, 3 = bottom, 4 = right,
; 5 = pb_127_m127, 6-10 = SMOOTH_OUTPUT_16B scratch.
.w32:
    movu                 m1, [tlq+1]                      ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                     ; top
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    mova         [rsp+16*3], m0
    mova         [rsp+16*4], m4
    mova         [rsp+16*5], m5
.w32_loop:
    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
    lea                dstq, [dstq-16+strideq]
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w32_loop
    RET
ALIGN function_align
; 64 pixels wide: one row per iteration, as four 16-pixel segments.
; Same slot layout as .w32, plus slots 11-12 for top pixels 32-63.
.w64:
    movu                 m1, [tlq+1]                      ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                     ; top
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    movu                 m1, [tlq+33]                     ; top
    movu                 m2, [tlq+49]                     ; top
    mova        [rsp+16*11], m1
    mova        [rsp+16*12], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    mova         [rsp+16*3], m0
    mova         [rsp+16*4], m4
    mova         [rsp+16*5], m5
.w64_loop:
    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9,  [base+smooth_weights+16*8],  [base+smooth_weights+16*9], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
    lea                dstq, [dstq-48+strideq]
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w64_loop
    RET

%if ARCH_X86_64
cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
    %define            base  r7-$$
    lea                  r7, [$$]
    mova                 m8, [base+pw_62]
    mova                 m9, [base+pw_64]
    mova                m10, [base+pw_512]
%else
cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
1238*c0909341SAndroid Build Coastguard Worker %define base r1-$$ 1239*c0909341SAndroid Build Coastguard Worker %define m8 [base+pw_62] 1240*c0909341SAndroid Build Coastguard Worker %define m9 [base+pw_64] 1241*c0909341SAndroid Build Coastguard Worker %define m10 [base+pw_512] 1242*c0909341SAndroid Build Coastguard Worker %define strideq r3 1243*c0909341SAndroid Build Coastguard Worker %define stridemp dword [rsp+16*12] 1244*c0909341SAndroid Build Coastguard Worker mov stridemp, r1 1245*c0909341SAndroid Build Coastguard Worker LEA r1, $$ 1246*c0909341SAndroid Build Coastguard Worker%endif 1247*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 1248*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 1249*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1250*c0909341SAndroid Build Coastguard Worker inc tlq 1251*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+ipred_z1_ssse3_table+wq*4] 1252*c0909341SAndroid Build Coastguard Worker mov dxd, angled 1253*c0909341SAndroid Build Coastguard Worker and dxd, 0x7e 1254*c0909341SAndroid Build Coastguard Worker add angled, 165 ; ~90 1255*c0909341SAndroid Build Coastguard Worker lea wq, [base+wq+ipred_z1_ssse3_table] 1256*c0909341SAndroid Build Coastguard Worker movzx dxd, word [base+dr_intra_derivative+dxq] 1257*c0909341SAndroid Build Coastguard Worker xor angled, 0x4ff ; d = 90 - angle 1258*c0909341SAndroid Build Coastguard Worker jmp wq 1259*c0909341SAndroid Build Coastguard Worker.w4: 1260*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+88] 1261*c0909341SAndroid Build Coastguard Worker test r3d, 0x480 1262*c0909341SAndroid Build Coastguard Worker jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 1263*c0909341SAndroid Build Coastguard Worker sar r3d, 9 1264*c0909341SAndroid Build Coastguard Worker add r3d, hd 1265*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 1266*c0909341SAndroid Build Coastguard Worker jg .w4_no_upsample ; h > 8 || (w == h && is_sm) 
1267*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-1] 1268*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, [base+z_upsample1] 1269*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+z_upsample2] 1270*c0909341SAndroid Build Coastguard Worker movddup m2, [base+pb_36_m4] 1271*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1272*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1273*c0909341SAndroid Build Coastguard Worker pshufd m7, m1, q3333 1274*c0909341SAndroid Build Coastguard Worker movd [rsp+16], m7 ; top[max_base_x] 1275*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 1276*c0909341SAndroid Build Coastguard Worker movd m6, dxd 1277*c0909341SAndroid Build Coastguard Worker mov r5d, dxd ; xpos 1278*c0909341SAndroid Build Coastguard Worker pshufb m6, [base+pw_256] 1279*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1280*c0909341SAndroid Build Coastguard Worker movq m0, [tlq] 1281*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1282*c0909341SAndroid Build Coastguard Worker paddw m7, m6, m6 1283*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7 ; xpos0 xpos1 1284*c0909341SAndroid Build Coastguard Worker packuswb m1, m1 1285*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 1286*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1287*c0909341SAndroid Build Coastguard Worker mova [rsp], m0 1288*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop: 1289*c0909341SAndroid Build Coastguard Worker lea r2d, [r5+dxq] 1290*c0909341SAndroid Build Coastguard Worker shr r5d, 6 ; base0 1291*c0909341SAndroid Build Coastguard Worker movq m0, [rsp+r5] 1292*c0909341SAndroid Build Coastguard Worker lea r5d, [r2+dxq] 1293*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base1 1294*c0909341SAndroid Build Coastguard Worker movhps m0, [rsp+r2] 1295*c0909341SAndroid Build Coastguard Worker pand m2, m8, m6 ; frac 1296*c0909341SAndroid Build Coastguard Worker psubw m1, m9, m2 ; 
64-frac 1297*c0909341SAndroid Build Coastguard Worker psllw m2, 8 1298*c0909341SAndroid Build Coastguard Worker por m1, m2 ; 64-frac, frac 1299*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 1300*c0909341SAndroid Build Coastguard Worker paddw m6, m7 ; xpos += dx 1301*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1302*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1303*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m0 1304*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q1032 1305*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m0 1306*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1307*c0909341SAndroid Build Coastguard Worker sub hd, 2 1308*c0909341SAndroid Build Coastguard Worker jg .w4_upsample_loop 1309*c0909341SAndroid Build Coastguard Worker RET 1310*c0909341SAndroid Build Coastguard Worker.w4_no_upsample: 1311*c0909341SAndroid Build Coastguard Worker mov r3d, 7 ; max_base 1312*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 1313*c0909341SAndroid Build Coastguard Worker jnz .w4_main 1314*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1315*c0909341SAndroid Build Coastguard Worker movd m0, r3d 1316*c0909341SAndroid Build Coastguard Worker movd m2, angled 1317*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1318*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1319*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 1320*c0909341SAndroid Build Coastguard Worker pshufb m2, m1 1321*c0909341SAndroid Build Coastguard Worker pcmpeqb m1, m0, [base+z_filter_wh4] 1322*c0909341SAndroid Build Coastguard Worker pand m1, m2 1323*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, [base+z_filter_t_w48+angleq*8] 1324*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 1325*c0909341SAndroid Build Coastguard Worker mov r3d, 7 1326*c0909341SAndroid Build Coastguard Worker test r5d, r5d 
1327*c0909341SAndroid Build Coastguard Worker jz .w4_main ; filter_strength == 0 1328*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-1] 1329*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 1330*c0909341SAndroid Build Coastguard Worker movu m7, [base+z_filter_s+8] 1331*c0909341SAndroid Build Coastguard Worker shr r5d, 30 ; filter_strength 1332*c0909341SAndroid Build Coastguard Worker movddup m0, [base+pb_8] 1333*c0909341SAndroid Build Coastguard Worker pminub m7, m0 1334*c0909341SAndroid Build Coastguard Worker pshufb m0, m3, [base+z_filter_s] 1335*c0909341SAndroid Build Coastguard Worker movddup m4, [base+z_filter_k-8+r5*8+24*0] 1336*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 1337*c0909341SAndroid Build Coastguard Worker movddup m5, [base+z_filter_k-8+r5*8+24*1] 1338*c0909341SAndroid Build Coastguard Worker shufps m2, m0, m3, q2121 1339*c0909341SAndroid Build Coastguard Worker movddup m6, [base+z_filter_k-8+r5*8+24*2] 1340*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m4 1341*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2, m4 1342*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1343*c0909341SAndroid Build Coastguard Worker paddd m5, m6 1344*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m5 1345*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1346*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1347*c0909341SAndroid Build Coastguard Worker paddw m1, m4 1348*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1349*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q3333 1350*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1351*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1352*c0909341SAndroid Build Coastguard Worker mov r5d, 9 1353*c0909341SAndroid Build Coastguard Worker mov tlq, rsp 1354*c0909341SAndroid Build Coastguard Worker cmp hd, 4 1355*c0909341SAndroid Build Coastguard Worker cmovne r3d, r5d 1356*c0909341SAndroid Build Coastguard Worker 
packuswb m0, m1 1357*c0909341SAndroid Build Coastguard Worker mova [tlq], m0 1358*c0909341SAndroid Build Coastguard Worker.w4_main: 1359*c0909341SAndroid Build Coastguard Worker add tlq, r3 1360*c0909341SAndroid Build Coastguard Worker movd m5, dxd 1361*c0909341SAndroid Build Coastguard Worker movddup m0, [base+z_base_inc] ; base_inc << 6 1362*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] ; top[max_base_x] 1363*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1364*c0909341SAndroid Build Coastguard Worker movd m4, r3d 1365*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 1366*c0909341SAndroid Build Coastguard Worker mov r5d, dxd ; xpos 1367*c0909341SAndroid Build Coastguard Worker pshufb m7, [base+pw_m256] 1368*c0909341SAndroid Build Coastguard Worker sub r5, r3 1369*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+pw_256] 1370*c0909341SAndroid Build Coastguard Worker mova m3, [base+z1_shuf_w4] 1371*c0909341SAndroid Build Coastguard Worker paddw m6, m5, m5 1372*c0909341SAndroid Build Coastguard Worker psubw m4, m0 ; max_base_x 1373*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m6 ; xpos0 xpos1 1374*c0909341SAndroid Build Coastguard Worker.w4_loop: 1375*c0909341SAndroid Build Coastguard Worker lea r3, [r5+dxq] 1376*c0909341SAndroid Build Coastguard Worker sar r5, 6 ; base0 1377*c0909341SAndroid Build Coastguard Worker movq m0, [tlq+r5] 1378*c0909341SAndroid Build Coastguard Worker lea r5, [r3+dxq] 1379*c0909341SAndroid Build Coastguard Worker sar r3, 6 ; base1 1380*c0909341SAndroid Build Coastguard Worker movhps m0, [tlq+r3] 1381*c0909341SAndroid Build Coastguard Worker pand m2, m8, m5 ; frac 1382*c0909341SAndroid Build Coastguard Worker psubw m1, m9, m2 ; 64-frac 1383*c0909341SAndroid Build Coastguard Worker psllw m2, 8 1384*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 1385*c0909341SAndroid Build Coastguard Worker por m1, m2 ; 64-frac, frac 1386*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 
1387*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1388*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m4, m5 ; base < max_base_x 1389*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1390*c0909341SAndroid Build Coastguard Worker paddw m5, m6 ; xpos += dx 1391*c0909341SAndroid Build Coastguard Worker pand m0, m1 1392*c0909341SAndroid Build Coastguard Worker pandn m1, m7 1393*c0909341SAndroid Build Coastguard Worker por m0, m1 1394*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1395*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m0 1396*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q1032 1397*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m0 1398*c0909341SAndroid Build Coastguard Worker sub hd, 2 1399*c0909341SAndroid Build Coastguard Worker jz .w4_end 1400*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1401*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1402*c0909341SAndroid Build Coastguard Worker jl .w4_loop 1403*c0909341SAndroid Build Coastguard Worker packuswb m7, m7 1404*c0909341SAndroid Build Coastguard Worker.w4_end_loop: 1405*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m7 1406*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m7 1407*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1408*c0909341SAndroid Build Coastguard Worker sub hd, 2 1409*c0909341SAndroid Build Coastguard Worker jg .w4_end_loop 1410*c0909341SAndroid Build Coastguard Worker.w4_end: 1411*c0909341SAndroid Build Coastguard Worker RET 1412*c0909341SAndroid Build Coastguard Worker.w8: 1413*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+88] 1414*c0909341SAndroid Build Coastguard Worker and r3d, ~0x7f 1415*c0909341SAndroid Build Coastguard Worker or r3d, hd 1416*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 1417*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm 
|| d >= 40 || h > 8 1418*c0909341SAndroid Build Coastguard Worker mova m5, [base+z_upsample1] 1419*c0909341SAndroid Build Coastguard Worker movu m3, [base+z_filter_s+6] 1420*c0909341SAndroid Build Coastguard Worker movd m4, hd 1421*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-1] 1422*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+7] 1423*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1424*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 1425*c0909341SAndroid Build Coastguard Worker movddup m7, [base+pb_36_m4] 1426*c0909341SAndroid Build Coastguard Worker pminub m4, m3 1427*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1428*c0909341SAndroid Build Coastguard Worker pshufb m2, m0, m5 1429*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7 1430*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 1431*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7 1432*c0909341SAndroid Build Coastguard Worker movd m6, dxd 1433*c0909341SAndroid Build Coastguard Worker pshufb m3, m1, m5 1434*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m7 1435*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1436*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m7 1437*c0909341SAndroid Build Coastguard Worker pshufb m6, [base+pw_256] 1438*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1439*c0909341SAndroid Build Coastguard Worker paddw m2, m0 1440*c0909341SAndroid Build Coastguard Worker paddw m7, m6, m6 1441*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1442*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7 ; xpos0 xpos1 1443*c0909341SAndroid Build Coastguard Worker movu m1, [tlq] 1444*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m10 1445*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m10 1446*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 1447*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1448*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 
1449*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1450*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 1451*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m1 1452*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop: 1453*c0909341SAndroid Build Coastguard Worker lea r2d, [r5+dxq] 1454*c0909341SAndroid Build Coastguard Worker shr r5d, 6 ; base0 1455*c0909341SAndroid Build Coastguard Worker movu m0, [rsp+r5] 1456*c0909341SAndroid Build Coastguard Worker lea r5d, [r2+dxq] 1457*c0909341SAndroid Build Coastguard Worker shr r2d, 6 ; base1 1458*c0909341SAndroid Build Coastguard Worker movu m1, [rsp+r2] 1459*c0909341SAndroid Build Coastguard Worker pand m2, m8, m6 1460*c0909341SAndroid Build Coastguard Worker psubw m3, m9, m2 1461*c0909341SAndroid Build Coastguard Worker psllw m2, 8 1462*c0909341SAndroid Build Coastguard Worker por m3, m2 1463*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m3, m3 ; frac0 1464*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1465*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 ; frac1 1466*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1467*c0909341SAndroid Build Coastguard Worker paddw m6, m7 1468*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1469*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1470*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1471*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m0 1472*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], m0 1473*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1474*c0909341SAndroid Build Coastguard Worker sub hd, 2 1475*c0909341SAndroid Build Coastguard Worker jg .w8_upsample_loop 1476*c0909341SAndroid Build Coastguard Worker RET 1477*c0909341SAndroid Build Coastguard Worker.w8_no_upsample: 1478*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 1479*c0909341SAndroid Build Coastguard Worker movd m0, r3d 
1480*c0909341SAndroid Build Coastguard Worker and r3d, 7 1481*c0909341SAndroid Build Coastguard Worker or r3d, 8 ; imin(h+7, 15) 1482*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1483*c0909341SAndroid Build Coastguard Worker jnz .w8_main 1484*c0909341SAndroid Build Coastguard Worker movd m2, angled 1485*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1486*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1487*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 1488*c0909341SAndroid Build Coastguard Worker pshufb m2, m1 1489*c0909341SAndroid Build Coastguard Worker movu m1, [base+z_filter_wh8] 1490*c0909341SAndroid Build Coastguard Worker psrldq m3, [base+z_filter_t_w48+angleq*8], 4 1491*c0909341SAndroid Build Coastguard Worker pcmpeqb m1, m0 1492*c0909341SAndroid Build Coastguard Worker pand m1, m2 1493*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m3 1494*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m1 1495*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1496*c0909341SAndroid Build Coastguard Worker jz .w8_main ; filter_strength == 0 1497*c0909341SAndroid Build Coastguard Worker movd m3, [tlq-1] 1498*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+16*0] 1499*c0909341SAndroid Build Coastguard Worker imul r5d, 0x55555555 1500*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*1] 1501*c0909341SAndroid Build Coastguard Worker shr r5d, 30 ; filter_strength 1502*c0909341SAndroid Build Coastguard Worker movd m2, [tlq+r3] 1503*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*4] 1504*c0909341SAndroid Build Coastguard Worker sub r5, 3 1505*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m0 1506*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1507*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m1 1508*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 1509*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 1510*c0909341SAndroid Build Coastguard Worker 
mova [tlq-16*2], m3 1511*c0909341SAndroid Build Coastguard Worker movq [tlq+r3-15], m2 1512*c0909341SAndroid Build Coastguard Worker call .filter_edge 1513*c0909341SAndroid Build Coastguard Worker sar r5d, 1 1514*c0909341SAndroid Build Coastguard Worker add r5d, 17 1515*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1516*c0909341SAndroid Build Coastguard Worker cmova r3d, r5d 1517*c0909341SAndroid Build Coastguard Worker.w8_main: 1518*c0909341SAndroid Build Coastguard Worker add tlq, r3 1519*c0909341SAndroid Build Coastguard Worker movd m5, dxd 1520*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 1521*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1522*c0909341SAndroid Build Coastguard Worker movu m3, [base+z_filter_s+2] 1523*c0909341SAndroid Build Coastguard Worker movd m4, r3d 1524*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 1525*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1526*c0909341SAndroid Build Coastguard Worker pshufb m7, [base+pw_m256] 1527*c0909341SAndroid Build Coastguard Worker sub r5, r3 1528*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+pw_256] 1529*c0909341SAndroid Build Coastguard Worker psubw m4, [base+z_base_inc] 1530*c0909341SAndroid Build Coastguard Worker mova m6, m5 1531*c0909341SAndroid Build Coastguard Worker.w8_loop: 1532*c0909341SAndroid Build Coastguard Worker mov r3, r5 1533*c0909341SAndroid Build Coastguard Worker sar r3, 6 1534*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r3] 1535*c0909341SAndroid Build Coastguard Worker pand m1, m8, m5 1536*c0909341SAndroid Build Coastguard Worker psubw m2, m9, m1 1537*c0909341SAndroid Build Coastguard Worker psllw m1, 8 1538*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 1539*c0909341SAndroid Build Coastguard Worker por m1, m2 1540*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 1541*c0909341SAndroid Build Coastguard Worker pcmpgtw m1, m4, m5 1542*c0909341SAndroid Build Coastguard Worker paddw m5, m6 
1543*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1544*c0909341SAndroid Build Coastguard Worker pand m0, m1 1545*c0909341SAndroid Build Coastguard Worker pandn m1, m7 1546*c0909341SAndroid Build Coastguard Worker por m0, m1 1547*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1548*c0909341SAndroid Build Coastguard Worker movq [dstq], m0 1549*c0909341SAndroid Build Coastguard Worker dec hd 1550*c0909341SAndroid Build Coastguard Worker jz .w8_end 1551*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1552*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1553*c0909341SAndroid Build Coastguard Worker add r5, dxq 1554*c0909341SAndroid Build Coastguard Worker jl .w8_loop 1555*c0909341SAndroid Build Coastguard Worker packuswb m7, m7 1556*c0909341SAndroid Build Coastguard Worker.w8_end_loop: 1557*c0909341SAndroid Build Coastguard Worker movq [dstq], m7 1558*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1559*c0909341SAndroid Build Coastguard Worker dec hd 1560*c0909341SAndroid Build Coastguard Worker jg .w8_end_loop 1561*c0909341SAndroid Build Coastguard Worker.w8_end: 1562*c0909341SAndroid Build Coastguard Worker RET 1563*c0909341SAndroid Build Coastguard Worker.w16: 1564*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+15] 1565*c0909341SAndroid Build Coastguard Worker movd m0, r3d 1566*c0909341SAndroid Build Coastguard Worker and r3d, 15 1567*c0909341SAndroid Build Coastguard Worker or r3d, 16 ; imin(h+15, 31) 1568*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1569*c0909341SAndroid Build Coastguard Worker jnz .w16_main 1570*c0909341SAndroid Build Coastguard Worker movd m2, angled 1571*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1572*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1573*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 1574*c0909341SAndroid Build Coastguard Worker pshufb m2, m1 1575*c0909341SAndroid Build Coastguard Worker movq m3, 
[base+z_filter_t_w16+angleq*4] 1576*c0909341SAndroid Build Coastguard Worker pcmpeqb m0, [base+z_filter_wh16] 1577*c0909341SAndroid Build Coastguard Worker pand m0, m2 1578*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m3 1579*c0909341SAndroid Build Coastguard Worker pmovmskb r5d, m0 1580*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1581*c0909341SAndroid Build Coastguard Worker jz .w16_main ; filter_strength == 0 1582*c0909341SAndroid Build Coastguard Worker movd m4, [tlq-1] 1583*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+16*0] 1584*c0909341SAndroid Build Coastguard Worker imul r5d, 0x24924924 1585*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*1] 1586*c0909341SAndroid Build Coastguard Worker shr r5d, 30 1587*c0909341SAndroid Build Coastguard Worker movd m2, [tlq+30] 1588*c0909341SAndroid Build Coastguard Worker adc r5, -4 ; filter_strength-3 1589*c0909341SAndroid Build Coastguard Worker movd m3, [tlq+r3] 1590*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*4] 1591*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m0 1592*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1593*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m1 1594*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 1595*c0909341SAndroid Build Coastguard Worker movd [rsp], m2 1596*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 1597*c0909341SAndroid Build Coastguard Worker mova [tlq-16*2], m4 1598*c0909341SAndroid Build Coastguard Worker movd [tlq+r3-16], m3 1599*c0909341SAndroid Build Coastguard Worker call .filter_edge 1600*c0909341SAndroid Build Coastguard Worker cmp hd, 16 1601*c0909341SAndroid Build Coastguard Worker jle .w16_main 1602*c0909341SAndroid Build Coastguard Worker pshuflw m0, [rsp], q0000 1603*c0909341SAndroid Build Coastguard Worker sar r5, 1 1604*c0909341SAndroid Build Coastguard Worker movd m1, [base+z_filter_k_tail+4+r5*4] 1605*c0909341SAndroid Build Coastguard Worker lea r3d, [r5+33] 
1606*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 1607*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1608*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1609*c0909341SAndroid Build Coastguard Worker%else 1610*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 1611*c0909341SAndroid Build Coastguard Worker%endif 1612*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1613*c0909341SAndroid Build Coastguard Worker movd [tlq+32], m0 1614*c0909341SAndroid Build Coastguard Worker.w16_main: 1615*c0909341SAndroid Build Coastguard Worker add tlq, r3 1616*c0909341SAndroid Build Coastguard Worker movd m5, dxd 1617*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 1618*c0909341SAndroid Build Coastguard Worker movd m4, r3d 1619*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1620*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 1621*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1622*c0909341SAndroid Build Coastguard Worker pshufb m7, m6 1623*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1624*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 1625*c0909341SAndroid Build Coastguard Worker sub r5, r3 1626*c0909341SAndroid Build Coastguard Worker psubb m4, [base+pb_0to15] 1627*c0909341SAndroid Build Coastguard Worker mova m6, m5 1628*c0909341SAndroid Build Coastguard Worker.w16_loop: 1629*c0909341SAndroid Build Coastguard Worker mov r3, r5 1630*c0909341SAndroid Build Coastguard Worker sar r3, 6 1631*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+0] 1632*c0909341SAndroid Build Coastguard Worker pand m0, m8, m5 1633*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+1] 1634*c0909341SAndroid Build Coastguard Worker psubw m3, m9, m0 1635*c0909341SAndroid Build Coastguard Worker psllw m0, 8 1636*c0909341SAndroid Build Coastguard Worker por m3, m0 1637*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1638*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 
1639*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1640*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1641*c0909341SAndroid Build Coastguard Worker psrlw m3, m5, 6 1642*c0909341SAndroid Build Coastguard Worker packsswb m3, m3 1643*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1644*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1645*c0909341SAndroid Build Coastguard Worker paddw m5, m6 1646*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m4, m3 1647*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1648*c0909341SAndroid Build Coastguard Worker pand m0, m2 1649*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1650*c0909341SAndroid Build Coastguard Worker por m0, m2 1651*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 1652*c0909341SAndroid Build Coastguard Worker dec hd 1653*c0909341SAndroid Build Coastguard Worker jz .w16_end 1654*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1655*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1656*c0909341SAndroid Build Coastguard Worker add r5, dxq 1657*c0909341SAndroid Build Coastguard Worker jl .w16_loop 1658*c0909341SAndroid Build Coastguard Worker.w16_end_loop: 1659*c0909341SAndroid Build Coastguard Worker mova [dstq], m7 1660*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1661*c0909341SAndroid Build Coastguard Worker dec hd 1662*c0909341SAndroid Build Coastguard Worker jg .w16_end_loop 1663*c0909341SAndroid Build Coastguard Worker.w16_end: 1664*c0909341SAndroid Build Coastguard Worker RET 1665*c0909341SAndroid Build Coastguard Worker.w32: 1666*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+31] 1667*c0909341SAndroid Build Coastguard Worker and r3d, 31 1668*c0909341SAndroid Build Coastguard Worker or r3d, 32 ; imin(h+31, 63) 1669*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 1670*c0909341SAndroid Build Coastguard Worker jnz .w32_main 1671*c0909341SAndroid Build 
Coastguard Worker movd m6, [tlq-1] 1672*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+16*0] 1673*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*1] 1674*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+16*2] 1675*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+16*3] 1676*c0909341SAndroid Build Coastguard Worker movd m4, [tlq+62] 1677*c0909341SAndroid Build Coastguard Worker movd m5, [tlq+r3] 1678*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*6] 1679*c0909341SAndroid Build Coastguard Worker mova [tlq-16*3], m0 1680*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1681*c0909341SAndroid Build Coastguard Worker mova [tlq-16*2], m1 1682*c0909341SAndroid Build Coastguard Worker pshufb m6, m7 1683*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m2 1684*c0909341SAndroid Build Coastguard Worker xor r5d, r5d ; filter_strength = 3 1685*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m3 1686*c0909341SAndroid Build Coastguard Worker movd [rsp], m4 1687*c0909341SAndroid Build Coastguard Worker pshufb m5, m7 1688*c0909341SAndroid Build Coastguard Worker mova [tlq-16*4], m6 1689*c0909341SAndroid Build Coastguard Worker movd [tlq+r3-48], m5 1690*c0909341SAndroid Build Coastguard Worker call .filter_edge 1691*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 1692*c0909341SAndroid Build Coastguard Worker call .filter_edge 1693*c0909341SAndroid Build Coastguard Worker cmp hd, 32 1694*c0909341SAndroid Build Coastguard Worker jle .w32_main 1695*c0909341SAndroid Build Coastguard Worker pshuflw m0, [rsp], q0000 1696*c0909341SAndroid Build Coastguard Worker movd m1, [base+z_filter_k_tail+4] 1697*c0909341SAndroid Build Coastguard Worker add r3d, 2 1698*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 1699*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1700*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1701*c0909341SAndroid Build Coastguard Worker%else 1702*c0909341SAndroid Build Coastguard 
Worker pmulhrsw m0, m4 1703*c0909341SAndroid Build Coastguard Worker%endif 1704*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1705*c0909341SAndroid Build Coastguard Worker movd [tlq+64], m0 1706*c0909341SAndroid Build Coastguard Worker.w32_main: 1707*c0909341SAndroid Build Coastguard Worker add tlq, r3 1708*c0909341SAndroid Build Coastguard Worker movd m0, r3d 1709*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 1710*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1711*c0909341SAndroid Build Coastguard Worker movd m5, dxd 1712*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1713*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1714*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 1715*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 1716*c0909341SAndroid Build Coastguard Worker sub r5, r3 1717*c0909341SAndroid Build Coastguard Worker pshufb m7, m6 1718*c0909341SAndroid Build Coastguard Worker psubb m0, [base+pb_0to15] 1719*c0909341SAndroid Build Coastguard Worker movddup m1, [base+pb_m16] 1720*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 1721*c0909341SAndroid Build Coastguard Worker paddb m0, m1 1722*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 1723*c0909341SAndroid Build Coastguard Worker mova m6, m5 1724*c0909341SAndroid Build Coastguard Worker.w32_loop: 1725*c0909341SAndroid Build Coastguard Worker mov r3, r5 1726*c0909341SAndroid Build Coastguard Worker sar r3, 6 1727*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*0+0] 1728*c0909341SAndroid Build Coastguard Worker pand m0, m8, m5 1729*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+16*0+1] 1730*c0909341SAndroid Build Coastguard Worker psubw m3, m9, m0 1731*c0909341SAndroid Build Coastguard Worker psllw m0, 8 1732*c0909341SAndroid Build Coastguard Worker por m3, m0 1733*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1734*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 
1735*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1736*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1737*c0909341SAndroid Build Coastguard Worker psrlw m4, m5, 6 1738*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1739*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1740*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 1741*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*0], m4 1742*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1743*c0909341SAndroid Build Coastguard Worker pand m0, m2 1744*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1745*c0909341SAndroid Build Coastguard Worker por m0, m2 1746*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*1+0] 1747*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+16*1+1] 1748*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m0 1749*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1750*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 1751*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1752*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1753*c0909341SAndroid Build Coastguard Worker paddw m5, m6 1754*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1755*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1756*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*1], m4 1757*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1758*c0909341SAndroid Build Coastguard Worker pand m0, m2 1759*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1760*c0909341SAndroid Build Coastguard Worker por m0, m2 1761*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m0 1762*c0909341SAndroid Build Coastguard Worker dec hd 1763*c0909341SAndroid Build Coastguard Worker jz .w32_end 1764*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1765*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1766*c0909341SAndroid Build Coastguard 
Worker add r5, dxq 1767*c0909341SAndroid Build Coastguard Worker jl .w32_loop 1768*c0909341SAndroid Build Coastguard Worker.w32_end_loop: 1769*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m7 1770*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m7 1771*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1772*c0909341SAndroid Build Coastguard Worker dec hd 1773*c0909341SAndroid Build Coastguard Worker jg .w32_end_loop 1774*c0909341SAndroid Build Coastguard Worker.w32_end: 1775*c0909341SAndroid Build Coastguard Worker RET 1776*c0909341SAndroid Build Coastguard Worker.w64: 1777*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+63] 1778*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 1779*c0909341SAndroid Build Coastguard Worker jnz .w64_main 1780*c0909341SAndroid Build Coastguard Worker movd m4, [tlq-1] 1781*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+16*0] 1782*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*1] 1783*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+16*2] 1784*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+16*3] 1785*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m0 1786*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1787*c0909341SAndroid Build Coastguard Worker mova [rsp+16*4], m1 1788*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 1789*c0909341SAndroid Build Coastguard Worker mova [rsp+16*5], m2 1790*c0909341SAndroid Build Coastguard Worker mova [rsp+16*6], m3 1791*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m4 1792*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+16*4] 1793*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16*5] 1794*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+16*6] 1795*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+16*7] 1796*c0909341SAndroid Build Coastguard Worker movd m4, [tlq+r3] 1797*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*10] 
1798*c0909341SAndroid Build Coastguard Worker mova [tlq-16*3], m0 1799*c0909341SAndroid Build Coastguard Worker xor r5d, r5d ; filter_strength = 3 1800*c0909341SAndroid Build Coastguard Worker mova [tlq-16*2], m1 1801*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 1802*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m2 1803*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m3 1804*c0909341SAndroid Build Coastguard Worker movd [tlq+r3-16*7], m4 1805*c0909341SAndroid Build Coastguard Worker cmp hd, 64 1806*c0909341SAndroid Build Coastguard Worker jl .w64_filter96 ; skip one call if the last 32 bytes aren't used 1807*c0909341SAndroid Build Coastguard Worker call .filter_edge 1808*c0909341SAndroid Build Coastguard Worker.w64_filter96: 1809*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 1810*c0909341SAndroid Build Coastguard Worker call .filter_edge 1811*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 1812*c0909341SAndroid Build Coastguard Worker call .filter_edge 1813*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 1814*c0909341SAndroid Build Coastguard Worker call .filter_edge 1815*c0909341SAndroid Build Coastguard Worker.w64_main: 1816*c0909341SAndroid Build Coastguard Worker add tlq, r3 1817*c0909341SAndroid Build Coastguard Worker movd m0, r3d 1818*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 1819*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1820*c0909341SAndroid Build Coastguard Worker movd m5, dxd 1821*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1822*c0909341SAndroid Build Coastguard Worker mov r5d, dxd 1823*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 1824*c0909341SAndroid Build Coastguard Worker sub r5, r3 1825*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 1826*c0909341SAndroid Build Coastguard Worker pshufb m7, m6 1827*c0909341SAndroid Build Coastguard Worker psubb m0, [base+pb_0to15] 1828*c0909341SAndroid Build Coastguard Worker movddup m1, [base+pb_m16] 
1829*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 1830*c0909341SAndroid Build Coastguard Worker paddb m0, m1 1831*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 1832*c0909341SAndroid Build Coastguard Worker paddb m0, m1 1833*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m0 1834*c0909341SAndroid Build Coastguard Worker paddb m0, m1 1835*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m0 1836*c0909341SAndroid Build Coastguard Worker mova m6, m5 1837*c0909341SAndroid Build Coastguard Worker.w64_loop: 1838*c0909341SAndroid Build Coastguard Worker mov r3, r5 1839*c0909341SAndroid Build Coastguard Worker sar r3, 6 1840*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*0+0] 1841*c0909341SAndroid Build Coastguard Worker pand m0, m8, m5 1842*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+16*0+1] 1843*c0909341SAndroid Build Coastguard Worker psubw m3, m9, m0 1844*c0909341SAndroid Build Coastguard Worker psllw m0, 8 1845*c0909341SAndroid Build Coastguard Worker por m3, m0 1846*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1847*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 1848*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1849*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1850*c0909341SAndroid Build Coastguard Worker psrlw m4, m5, 6 1851*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1852*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1853*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 1854*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*0], m4 1855*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1856*c0909341SAndroid Build Coastguard Worker pand m0, m2 1857*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1858*c0909341SAndroid Build Coastguard Worker por m0, m2 1859*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*1+0] 1860*c0909341SAndroid Build Coastguard Worker movu m2, 
[tlq+r3+16*1+1] 1861*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m0 1862*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1863*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 1864*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1865*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1866*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1867*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1868*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*1], m4 1869*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1870*c0909341SAndroid Build Coastguard Worker pand m0, m2 1871*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1872*c0909341SAndroid Build Coastguard Worker por m0, m2 1873*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*2+0] 1874*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+16*2+1] 1875*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m0 1876*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1877*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 1878*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1879*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1880*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1881*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1882*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*2], m4 1883*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1884*c0909341SAndroid Build Coastguard Worker pand m0, m2 1885*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1886*c0909341SAndroid Build Coastguard Worker por m0, m2 1887*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r3+16*3+0] 1888*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+r3+16*3+1] 1889*c0909341SAndroid Build Coastguard Worker mova [dstq+16*2], m0 1890*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 1891*c0909341SAndroid Build Coastguard Worker 
pmaddubsw m0, m3 1892*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 1893*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1894*c0909341SAndroid Build Coastguard Worker paddw m5, m6 1895*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 1896*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 1897*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [rsp+16*3], m4 1898*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1899*c0909341SAndroid Build Coastguard Worker pand m0, m2 1900*c0909341SAndroid Build Coastguard Worker pandn m2, m7 1901*c0909341SAndroid Build Coastguard Worker por m0, m2 1902*c0909341SAndroid Build Coastguard Worker mova [dstq+16*3], m0 1903*c0909341SAndroid Build Coastguard Worker dec hd 1904*c0909341SAndroid Build Coastguard Worker jz .w64_end 1905*c0909341SAndroid Build Coastguard Worker movifnidn strideq, stridemp 1906*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1907*c0909341SAndroid Build Coastguard Worker add r5, dxq 1908*c0909341SAndroid Build Coastguard Worker jl .w64_loop 1909*c0909341SAndroid Build Coastguard Worker.w64_end_loop: 1910*c0909341SAndroid Build Coastguard Worker mova [dstq+16*0], m7 1911*c0909341SAndroid Build Coastguard Worker mova [dstq+16*1], m7 1912*c0909341SAndroid Build Coastguard Worker mova [dstq+16*2], m7 1913*c0909341SAndroid Build Coastguard Worker mova [dstq+16*3], m7 1914*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1915*c0909341SAndroid Build Coastguard Worker dec hd 1916*c0909341SAndroid Build Coastguard Worker jg .w64_end_loop 1917*c0909341SAndroid Build Coastguard Worker.w64_end: 1918*c0909341SAndroid Build Coastguard Worker RET 1919*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1920*c0909341SAndroid Build Coastguard Worker.filter_edge: ; 32 pixels/iteration 1921*c0909341SAndroid Build Coastguard Worker movddup m7, [base+z_filter_k+8*2+r5*8+24*0] 1922*c0909341SAndroid Build Coastguard Worker movu m2, [tlq-18] 
1923*c0909341SAndroid Build Coastguard Worker movu m1, [tlq-17] 1924*c0909341SAndroid Build Coastguard Worker movu m3, [tlq- 2] 1925*c0909341SAndroid Build Coastguard Worker movu m4, [tlq- 1] 1926*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m1 1927*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7 1928*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m1 1929*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7 1930*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3, m4 1931*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m7 1932*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 1933*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m7 1934*c0909341SAndroid Build Coastguard Worker movddup m7, [base+z_filter_k+8*2+r5*8+24*1] 1935*c0909341SAndroid Build Coastguard Worker mova m5, [tlq-16] 1936*c0909341SAndroid Build Coastguard Worker movu m6, [tlq-15] 1937*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 1938*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 1939*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 1940*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m7 1941*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1942*c0909341SAndroid Build Coastguard Worker paddw m2, m5 1943*c0909341SAndroid Build Coastguard Worker mova m5, [tlq+ 0] 1944*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+ 1] 1945*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 1946*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 1947*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 1948*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m7 1949*c0909341SAndroid Build Coastguard Worker paddw m1, m4 1950*c0909341SAndroid Build Coastguard Worker paddw m3, m5 1951*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1952*c0909341SAndroid Build Coastguard Worker jnz .filter_end ; 3-tap 1953*c0909341SAndroid Build Coastguard Worker movddup m7, 
[base+z_filter_k+8*8] ; operand of an instruction that begins before this chunk
    ; ---- tail of a routine that starts before this chunk ----
    ; Accumulate one more pmaddubsw tap (weights in m7) into the four word
    ; accumulators: m0/m2 take the bytes at [tlq-14], m1/m3 those at [tlq+2].
    movu            m5, [tlq-14]
    movu            m6, [tlq+ 2]
    punpcklbw       m4, m5, m5
    pmaddubsw       m4, m7
    punpckhbw       m5, m5
    pmaddubsw       m5, m7
    paddw           m0, m4
    paddw           m2, m5
    punpcklbw       m5, m6, m6
    pmaddubsw       m5, m7
    punpckhbw       m6, m6
    pmaddubsw       m6, m7
    paddw           m1, m5
    paddw           m3, m6
.filter_end:
    ; Scale the word sums with pmulhrsw by m10, pack to bytes and write the
    ; 32 resulting pixels back to [tlq+0..31].
%if ARCH_X86_64
    REPX {pmulhrsw x, m10}, m0, m2, m1, m3
%else
    ; NOTE(review): m10 is presumably a memory-backed %define on 32-bit (it is
    ; in ipred_z2 below), hence the copy into a register first - confirm.
    mova            m4, m10
    REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
%endif
    packuswb        m0, m2
    packuswb        m1, m3
    mova            [tlq+16*0], m0
    mova            [tlq+16*1], m1
    ret

; ---------------------------------------------------------------------------
; ipred_z2_8bpc: directional intra prediction that reads both the top and the
; left edge (the table lookups below are annotated "angle - 90" and
; "180 - angle").
;
; Named arguments (from the cglobal lines): dst, stride, tl, w, h, angle, plus
; two stack arguments read through r6m/r7m and referred to as maxwm/maxhm.
;
; Stack frame built by the prologue (byte offsets from rsp):
;   16*0..16*1   scratch used by the main loop ([rsp+8*0..8*3]) and [rsp] as a
;                temporary home for angled
;   16*2..16*5   tl[-64..-1]  (left edge)
;   16*6..16*7   tl[0] broadcast (top-left pixel); on 32-bit the dword at
;                16*7+4 is afterwards reused to hold dy
;   16*8..16*9   tl[1..32]    (top edge; .w64 also fills 16*10..16*11)
;   16*12        saved xpos vector for the current 4-column strip
;   16*14        8 bytes taken from z_base_inc+2
;   16*15+0      z2_dy_offset (even dwords duplicated by movsldup)
;   16*15+8      pw_256, i.e. the per-strip x step 4<<6
;   16*16..      32-bit only: spill slots for m11, m12, r9d..r11d, maxw, maxh,
;                stride (see the %defines below)
; The top-left pixel therefore sits at rsp+127 and the top edge at rsp+128,
; which is why xpos is biased by 128<<6.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
; x86inc cglobal: 4 arguments loaded into registers, 12 GPRs, 13 XMM
; registers, 16*16 bytes of stack.
cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
    %define base r7-$$
    %define maxwm r6m
    %define maxhm r7m
    lea             r7, [$$]
    mov             hd, hm
    mova            m8, [base+pw_62]        ; mask for the 6-bit fraction (even values)
    mova            m9, [base+pw_64]        ; used as 64 - frac
    lea             r9d, [wq-4]
    mova            m10, [base+pw_512]      ; pmulhrsw multiplier
    shl             r9d, 6
    mova            m11, [base+z1_shuf_w4]  ; pshufb pattern applied to top-edge loads
    or              r9d, hd                 ; r9d = ((w-4)<<6) | h: low byte = h, upper
                                            ; bits count the remaining 4-column strips
    mova            m12, [base+z2_h_shuf]   ; pshufb pattern applied to left-edge loads
%else
; 32-bit build: only 7 GPRs and 8 XMM registers, so m8..m12, r9..r11 and
; maxw/maxh/stride are memory operands.
; NOTE(review): the negative stack size selects x86inc's alternative
; stack-pointer handling - see x86inc.asm for the exact semantics.
cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
    %define base r1-$$
    %define m8 [base+pw_62]
    %define m9 [base+pw_64]
    %define m10 [base+pw_512]
    %define m11 [rsp+16*16]
    %define m12 [rsp+16*17]
    %define r9b byte [rsp+16*18+4*0]
    %define r9d dword [rsp+16*18+4*0]
    %define r10d dword [rsp+16*18+4*1]
    %define r11d dword [rsp+16*18+4*2]
    %define maxwm [rsp+16*18+4*3]
    %define maxhm [rsp+16*19+4*0]
    %define stridemp [rsp+16*19+4*1]
    %define strideq r3
    %define dyd r4
    %define dyq r4
    mov             stridemp, r1            ; r1 becomes the PIC base, so spill stride
    mov             r1d, r6m
    mov             r4d, r7m
    mov             maxwm, r1d              ; copy both stack arguments into the frame
    mov             maxhm, r4d
    LEA             r1, $$
    lea             hd, [wq-4]              ; hd is only a temporary here
    mova            m0, [base+z1_shuf_w4]
    shl             hd, 6
    mova            m1, [base+z2_h_shuf]
    or              hd, hm
    mova            m11, m0
    mov             r9d, hd                 ; r9d = ((w-4)<<6) | h, as on x86-64
    mova            m12, m1
%endif
    tzcnt           wd, wd                  ; log2(w) -> jump-table index
    movifnidn       angled, anglem
    movsxd          wq, [base+ipred_z2_ssse3_table+wq*4]
%if ARCH_X86_64
    movzx           dxd, angleb             ; low byte of angle
%else
    movzx           dxd, byte anglem
%endif
    ; NOTE(review): bit 0x400 is flipped here and tested at every width entry
    ; ("test angled, 0x400 / jnz .w4_main"); presumably it is the caller's
    ; edge-filter-enable flag - confirm against the C caller.
    xor             angled, 0x400
    mova            m0, [tlq-16*4]          ; m0..m3 = tl[-64..-1]
    mov             dyd, dxd
    mova            m1, [tlq-16*3]
    neg             dxq
    mova            m2, [tlq-16*2]
    and             dyd, ~1                 ; even byte offsets into a table of words
    mova            m3, [tlq-16*1]
    and             dxq, ~1
    movd            m4, [tlq]               ; top-left pixel (byte 0)
    movu            m5, [tlq+16*0+1]        ; tl[1..16]
    movu            m6, [tlq+16*1+1]        ; tl[17..32]
    movzx           dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
    movzx           dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
    mova            [rsp+16*2], m0
    pxor            m7, m7                  ; m7 = 0: pshufb mask that broadcasts byte 0
    mova            [rsp+16*3], m1
    pshufb          m4, m7                  ; broadcast the top-left pixel
    mova            [rsp+16*4], m2
    lea             wq, [base+ipred_z2_ssse3_table+wq] ; absolute address of the width handler
    mova            [rsp+16*5], m3
    neg             dxd                     ; x moves left as y increases
    mova            [rsp+16*6], m4
    or              dyd, 4<<16              ; word 1 = 4 (read back in .w4_main via pshuflw q1111)
    mova            [rsp+16*7], m4
    mova            [rsp+16*8], m5
    mova            [rsp+16*9], m6
    movq            m0, [base+z_base_inc+2]
    movsldup        m1, [base+z2_dy_offset]
    movq            m2, [base+pw_256] ; 4<<6
    movq            [rsp+16*14+8*0], m0
    movq            [rsp+16*15+8*0], m1
    movq            [rsp+16*15+8*1], m2
%if ARCH_X86_64
    lea             r10d, [dxq+(128<<6)] ; xpos
%else
    mov             [rsp+16*7+4*1], dyd     ; spill dy: dyd aliases r4, clobbered next
    lea             r4d, [dxq+(128<<6)]
    mov             r10d, r4d               ; xpos (memory-backed r10d)
    movzx           hd, r9b                 ; restore h (hd was used as a temporary)
%endif
    mov             r11d, (128-4)<<6        ; xpos limit compared in .w4_loop; below it
                                            ; the rows are predicted from the left edge only
    jmp             wq                      ; dispatch on width
.w4:
    test            angled, 0x400
    jnz             .w4_main                ; no edge preprocessing
    movd            m5, [tlq+4]
    lea             r3d, [hq+2]
    add             angled, 1022
    pshufb          m5, m7                  ; broadcast tl[4] (last of the 4 top pixels)
    shl             r3d, 6
    movd            [rsp+16*8+4], m5        ; pad the 4 bytes after the top edge with it
    test            r3d, angled
    jnz             .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    call            .upsample_above
    ; Build in m6 the per-byte comparison mask that .w8_filter_left converts
    ; into the left-edge filter strength.
    sub             angled, 1075 ; angle - 53
    lea             r3d, [hq+3]
    xor             angled, 0x7f ; 180 - angle
    movd            m0, r3d
    movd            m6, angled
    shr             angled, 8 ; is_sm << 1
    pshufb          m0, m7
    pshufb          m6, m7
    pcmpeqb         m0, [base+z_filter_wh4]
    pand            m6, m0
    pcmpgtb         m6, [base+z_filter_t_w48+angleq*8]
    jmp             .w8_filter_left
; ---------------------------------------------------------------------------
; .upsample_above (subroutine, reached with "call"): doubles the resolution of
; the top edge held at rsp+16*8 and adjusts the x stepping to match.
; All stack references carry +gprsize because the return address pushed by
; "call" sits between rsp and the frame.
; Side effects: dx is doubled, the saved base increment at rsp+16*14 is
; doubled, xpos (r10d) gets dx+(1<<6) added, the left-only limit (r11d)
; becomes (128-7)<<6 and m11 is replaced by pb_0to15.
; ---------------------------------------------------------------------------
.upsample_above: ; w4/w8
    movq            m3, [rsp+gprsize+16*8-2]    ; p[-2..]
    movq            m1, [rsp+gprsize+16*8-1]    ; p[-1..] (p[-1] is the top-left pixel)
    movq            m0, [rsp+gprsize+16*8+0]    ; p[ 0..]
    movq            m4, [rsp+gprsize+16*8+1]    ; p[+1..]
    ; NOTE(review): by its name pb_36_m4 holds the byte pair (36, -4), i.e.
    ; 36 for the two inner taps and -4 for the two outer ones - confirm
    ; against the table definition.
    movddup         m5, [base+pb_36_m4]
    punpcklbw       m1, m3                      ; pairs (p[-1], p[-2])
    punpcklbw       m2, m0, m4                  ; pairs (p[ 0], p[+1])
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
%if ARCH_X86_64
    mova            m11, [base+pb_0to15]
    lea             r10d, [r10+dxq+(1<<6)]
    mov             r11d, (128-7)<<6
%else
    ; Same updates through the memory-backed slots (r10d = 16*18+4*1,
    ; r11d = 16*18+4*2, m11 = 16*16), each shifted by gprsize.
    mova            m3, [base+pb_0to15]
    mov             r3d, [rsp+gprsize+16*18+4*1]
    mov             dword [rsp+gprsize+16*18+4*2], (128-7)<<6
    lea             r3d, [r3+dxq+(1<<6)]
    mov             [rsp+gprsize+16*18+4*1], r3d
    mova            [rsp+gprsize+16*16], m3
%endif
    add             dxd, dxd                    ; edge has twice the samples -> step twice as far
    paddw           m1, m2                      ; 4-tap sum
    pmulhrsw        m1, m10                     ; m10 = pw_512: rounded >> 6
    movq            m2, [rsp+gprsize+16*14]
    paddw           m2, m2
    movq            [rsp+gprsize+16*14], m2
    packuswb        m1, m1                      ; clamp to 0..255
    punpcklbw       m1, m0                      ; interleave filtered samples with the originals
    mova            [rsp+gprsize+16*8], m1
    ret
; w4 without top upsampling: filter the top edge if required, then decide
; between upsampling and filtering the left edge.
.w4_no_upsample_above:
    lea             r3d, [hq+3]
    mov             [rsp], angled               ; keep angled; restored after the call
    sub             angled, 1112 ; angle - 90
    movd            m0, r3d
    mov             r3d, 90
    movd            m1, angled
    sub             r3d, angled ; 180 - angle
    shr             angled, 8 ; is_sm << 1
    movu            m3, [base+z_filter_wh4]
    mova            m4, [base+z_filter_t_w48+angleq*8]
    call            .w8_filter_top
    mov             angled, [rsp]
    lea             r3d, [hq+2]
    sub             angled, 139
    shl             r3d, 6
    test            r3d, angled
    jnz             .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; ---------------------------------------------------------------------------
; .upsample_left: same 4-tap upsampling applied to the end of the left edge
; (rsp+16*5..); falls through into .w4_main.
; NOTE(review): "pshufb m0, m7" is used as a byte-0 broadcast, which needs m7
; to be all-zero; .w8_filter_top may have reloaded m7 from z_filter_k before
; this point - confirm the rows it can load on this path are zero.
; ---------------------------------------------------------------------------
.upsample_left: ; w4/w8
    neg             hq
    movd            m0, [tlq+hq]                ; tl[-h], the last left pixel
    pshufb          m0, m7
    movd            [rsp+16*6+hq-4], m0         ; pad the 4 bytes beyond it
    movq            m3, [rsp+16*5+7]
    movq            m0, [rsp+16*5+8]
    movq            m2, [rsp+16*5+9]
    movq            m4, [rsp+16*5+10]
    movddup         m5, [base+pb_36_m4]
    punpcklbw       m1, m0, m3                  ; pairs (p[8], p[7])
    punpcklbw       m2, m4                      ; pairs (p[9], p[10])
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    movshdup        m3, [base+z2_dy_offset]     ; odd dwords: offsets for the upsampled case
%if ARCH_X86_64
    mova            m12, [base+z2_upsample]     ; replaces the left-edge shuffle
    add             dyd, dyd                    ; doubles dy and the step stored in word 1
%else
    mova            m4, [base+z2_upsample]
    shl             dword [rsp+16*7+4*1], 1     ; dy lives in memory on 32-bit
    mova            m12, m4
%endif
    paddw           m1, m2
    pmulhrsw        m1, m10                     ; rounded >> 6
    movq            [rsp+16*15], m3
    packuswb        m1, m1
    punpcklbw       m0, m1                      ; interleave originals with filtered samples
    mova            [rsp+16*5], m0
; ---------------------------------------------------------------------------
; .w4_main: prediction loop shared by every width (all width entry points
; jump here). The block is produced in 4-column strips (.w4_loop0), each strip
; 4 rows at a time (.w4_loop).
;   m6 = xpos for rows r (low qword) and r+1 (high qword)
;   m7 = 2*dx in every word
;   m4 = y position for the 4 columns of the strip
;   r2d = scalar xpos of the next row, r5 = dst of the current strip
; ---------------------------------------------------------------------------
.w4_main:
    movd            m6, dxd
%if ARCH_X86_64
    movd            m3, dyd
%else
    movd            m3, [rsp+16*7+4*1]
%endif
    movddup         m0, [rsp+16*14+8*0]
    pshufb          m6, [base+pw_256]           ; broadcast word 0 (dx)
    paddw           m7, m6, m6                  ; 2*dx
    movq            m5, [base+pw_m1to4]
    pshuflw         m4, m3, q0000               ; dy in 4 words
    punpcklqdq      m6, m7                      ; dx | 2*dx
    pmullw          m4, m5                      ; dy * per-column factors
    pshuflw         m3, m3, q1111               ; word 1 of dyd: base_y step per 4 rows
    paddw           m6, m0                      ; + base increments
    mov             r2d, r10d
    pshuflw         m0, m4, q3333
    psubw           m4, [rsp+16*15]
    movq            [rsp+16*6+8*1], m3
    movq            [rsp+8*1], m0 ; dy*4
    mov             r5, dstq
.w4_loop0:
    mova            [rsp+16*12], m6             ; remember strip xpos for .w4_end
    movq            [rsp+8*0], m4               ; remember strip ypos for .w4_end
    pand            m0, m4, m8                  ; frac_y
    psraw           m4, 6                       ; integer part
    psubw           m1, m9, m0
    psllw           m0, 8
    por             m0, m1 ; 64-frac_y, frac_y
    movq            [rsp+8*3], m0               ; left-edge weights for this strip
    pabsw           m4, m4
    movq            [rsp+8*2], m4               ; base_y0..3 (read back as bytes)
    movzx           hd, r9b                     ; reload h for this strip
.w4_loop:
    lea             r3d, [r2+dxq]               ; xpos of row 1
    ; ---- body of .w4_loop: 4 rows x 4 columns per iteration ----
    ; r2d/r3d alternate as the scalar xpos of successive rows; xpos >> 6 is a
    ; byte offset into the stack copy of the edge (top edge starts at rsp+128).
    shr             r2d, 6 ; base_x0
    movq            m0, [rsp+r2]
    lea             r2d, [r3+dxq]
    shr             r3d, 6 ; base_x1
    movhps          m0, [rsp+r3]
    lea             r3d, [r2+dxq]
    shr             r2d, 6 ; base_x2
    movq            m1, [rsp+r2]
    lea             r2d, [r3+dxq]               ; r2d = xpos of the next iteration's row 0
    shr             r3d, 6 ; base_x3
    movhps          m1, [rsp+r3]
    pand            m2, m8, m6                  ; frac_x for rows 0/1
    paddsw          m5, m6, m7                  ; xpos for rows 2/3
    psubw           m3, m9, m2
    psllw           m2, 8
    pshufb          m0, m11                     ; arrange pixel pairs for pmaddubsw
    por             m2, m3                      ; (64-frac_x, frac_x) byte pairs
    pmaddubsw       m0, m2
    pand            m2, m8, m5                  ; same for rows 2/3
    psubw           m3, m9, m2
    psllw           m2, 8
    pshufb          m1, m11
    por             m2, m3
    pmaddubsw       m1, m2
    cmp             r3d, 127 ; topleft
    jge             .w4_toponly                 ; row 3 still on the top edge: no left pixels needed
    ; At least part of the 4x4 needs left-edge pixels: fetch them per column
    ; using the byte offsets stored at rsp+8*2.
    movzx           r3d, byte [rsp+8*2+0] ; base_y0
    movq            m3, [rsp+r3]
    movzx           r3d, byte [rsp+8*2+2] ; base_y1
    movhps          m3, [rsp+r3]
    movzx           r3d, byte [rsp+8*2+4] ; base_y2
    movq            m4, [rsp+r3]
    movzx           r3d, byte [rsp+8*2+6] ; base_y3
    movhps          m4, [rsp+r3]
    pshufb          m3, m12
    pshufb          m4, m12
    punpckldq       m2, m3, m4                  ; regroup column data into rows 0/1 ...
    punpckhdq       m3, m4                      ; ... and rows 2/3
    movddup         m4, [rsp+8*3]               ; (64-frac_y, frac_y)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m4
    ; Per-pixel select: where xpos is negative take the left-edge result,
    ; otherwise keep the top-edge result.
    psraw           m6, 15 ; base_x < topleft
    pand            m2, m6
    pandn           m6, m0
    por             m0, m2, m6
    psraw           m6, m5, 15
    pand            m3, m6
    pandn           m6, m1
    por             m1, m3, m6
.w4_toponly:
    pmulhrsw        m0, m10                     ; m10 = pw_512: rounded >> 6
    pmulhrsw        m1, m10
    movifnidn       strideq, stridemp
    packuswb        m0, m1                      ; 16 bytes = rows 0..3, 4 pixels each
    movd            [dstq+strideq*0], m0
    pshuflw         m1, m0, q1032               ; bring row 1 into the low dword
    movd            [dstq+strideq*1], m1
    lea             dstq, [dstq+strideq*2]
    punpckhqdq      m0, m0                      ; rows 2/3
    movd            [dstq+strideq*0], m0
    psrlq           m0, 32
    movd            [dstq+strideq*1], m0
    sub             hd, 4
    jz              .w4_end
    movq            m4, [rsp+8*2]
    movq            m3, [rsp+16*6+8*1]          ; base_y step per 4 rows
    paddw           m6, m5, m7 ; xpos += dx
    psubw           m4, m3
    movq            [rsp+8*2], m4               ; advance base_y0..3
    lea             dstq, [dstq+strideq*2]
    cmp             r2d, r11d                   ; r11d: limit set in the prologue / .upsample_above
    jge             .w4_loop
    ; xpos has dropped below the limit: every remaining row of this strip
    ; comes from the left edge only.
    movddup         m5, [rsp+8*3]               ; (64-frac_y, frac_y)
.w4_leftonly_loop:
    movzx           r2d, byte [rsp+8*2+0] ; base_y0
    movq            m1, [rsp+r2]
    movzx           r2d, byte [rsp+8*2+2] ; base_y1
    movhps          m1, [rsp+r2]
    movzx           r2d, byte [rsp+8*2+4] ; base_y2
    movq            m2, [rsp+r2]
    movzx           r2d, byte [rsp+8*2+6] ; base_y3
    movhps          m2, [rsp+r2]
    psubw           m4, m3                      ; advance base_y for the next 4 rows
    pshufb          m1, m12
    pshufb          m2, m12
    movq            [rsp+8*2], m4
    punpckldq       m0, m1, m2
    punpckhdq       m1, m2
    pmaddubsw       m0, m5
    pmaddubsw       m1, m5
    pmulhrsw        m0, m10
    pmulhrsw        m1, m10
    packuswb        m0, m1
    movd            [dstq+strideq*0], m0        ; same 4-row store pattern as above
    pshuflw         m1, m0, q1032
    movd            [dstq+strideq*1], m1
    lea             dstq, [dstq+strideq*2]
    punpckhqdq      m0, m0
    movd            [dstq+strideq*0], m0
    psrlq           m0, 32
    movd            [dstq+strideq*1], m0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 4
    jg              .w4_leftonly_loop
.w4_end:
    ; Move on to the next 4-column strip, if any. The strip counter lives in
    ; the bits of r9d above the low byte (the low byte holds h).
    sub             r9d, 1<<8
    jl              .w4_ret
    movq            m4, [rsp+8*1]
    add             r5, 4
    mov             dstq, r5                    ; dst = top of the next strip
    paddw           m4, [rsp+8*0] ; base_y += 4*dy
    movzx           r2d, word [rsp+16*15+8*1]
    movddup         m6, [rsp+16*15+8*1]
    paddw           m6, [rsp+16*12] ; base_x += (4 << upsample_above)
    add             r2d, r10d
    mov             r10d, r2d                   ; scalar xpos of the new strip
    jmp             .w4_loop0
.w4_ret:
    RET
.w8:
    test            angled, 0x400
    jnz             .w4_main                    ; no edge preprocessing
    movd            m5, [tlq+8]
    lea             r3d, [angleq+126]
    pshufb          m5, m7                      ; broadcast tl[8] (last of the 8 top pixels)
    ; Replace the low byte of r3d with h so that a single unsigned compare
    ; against 8 (after the %endif) checks the angle bits and h together.
%if ARCH_X86_64
    mov             r3b, hb
%else
    ; ---- continuation of .w8 (32-bit branch of the "low byte = h" merge) ----
    xor             r3b, r3b
    or              r3d, hd
%endif
    movd            [rsp+16*8+8], m5            ; pad the 4 bytes after the 8 top pixels
    cmp             r3d, 8
    ja              .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    call            .upsample_above
    ; Build in m6 the comparison mask that .w8_filter_left turns into the
    ; left-edge filter strength.
    sub             angled, 53
    lea             r3d, [hq+7]
    xor             angled, 0x7f ; 180 - angle
    movu            m1, [base+z_filter_wh8]
    movd            m0, r3d
    movd            m6, angled
    shr             angled, 8 ; is_sm << 1
    psrldq          m2, [base+z_filter_t_w48+angleq*8], 4
    pshufb          m0, m7
    pshufb          m6, m7
    pcmpeqb         m0, m1
    pand            m6, m0
    pcmpgtb         m6, m2
    ; With an upsampled 8-wide top edge each 4-column strip advances xpos by
    ; 8<<6; m10 (pw_512) happens to hold exactly that value.
%if ARCH_X86_64
    movq            [rsp+16*15+8*1], m10 ; 8<<6
%else
    movq            m0, m10
    movq            [rsp+16*15+8*1], m0
%endif
    jmp             .w8_filter_left
.w8_no_upsample_above:
    lea             r3d, [hq+7]
    mov             [rsp], angled               ; keep angled; re-read after the call
    sub             angled, 90
    movd            m0, r3d
    mov             r3d, 90
    movd            m1, angled
    sub             r3d, angled ; 180 - angle
    shr             angled, 8 ; is_sm << 1
    movu            m3, [base+z_filter_wh8]
    psrldq          m4, [base+z_filter_t_w48+angleq*8], 4
    call            .w8_filter_top
    mov             r3d, [rsp]
    sub             r3d, 141
%if ARCH_X86_64
    mov             r3b, hb
%else
    xor             r3b, r3b
    or              r3d, hd
%endif
    cmp             r3d, 8
    jbe             .upsample_left ; angle > 140 && h <= 8 && !is_sm
; Left-edge filtering for w4/w8. m6 holds the comparison mask built by the
; caller; an empty mask means "no filtering".
.w8_filter_left:
    pmovmskb        r5d, m6
    test            r5d, r5d
    jz              .w4_main
    imul            r5d, 0x55555555             ; fold the mask bits into the top two bits
    mov             r3, tlq                     ; .filter_left reads the original edge via r3
    shr             r5d, 30
    sub             r5, 3 ; filter_strength-3
    jmp             .filter_left
; ---------------------------------------------------------------------------
; .w8_filter_top (subroutine, reached with "call"): optionally smooths the
; first 8 top-edge pixels in place at rsp+16*8.
; In:  m0 = w+h-1 in byte 0, m1 = angle-90 in byte 0, r3d = 180-angle,
;      m3 = size table row, m4 = threshold row, m7 = 0 (broadcast mask).
; Out: m6 = comparison mask for the left edge (consumed by .w8_filter_left);
;      m7 is overwritten when a filter is applied.
; Stack references carry +gprsize because of the pushed return address.
; ---------------------------------------------------------------------------
.w8_filter_top:
    movd            m6, r3d
    REPX {pshufb x, m7}, m0, m1, m6             ; broadcast byte 0 of each
    pcmpeqb         m0, m3
    pand            m1, m0
    pand            m6, m0
    pcmpgtb         m1, m4                      ; top-edge mask
    pcmpgtb         m6, m4                      ; left-edge mask
    pmovmskb        r5d, m1
    test            r5d, r5d
    jz              .w8_filter_top_end ; filter_strength == 0
    imul            r5d, 0x55555555
    movq            m0, [rsp+gprsize+16*8-2]
    shr             r5d, 30
    movq            m1, [rsp+gprsize+16*8-1]
    sub             r5, 3 ; filter_strength-3
    ; Three pmaddubsw passes over byte pairs (p[-2],p[-1]), (p[0],p[+1]) and
    ; (p[+2],p[+2]); the coefficient rows are 24 bytes apart in z_filter_k and
    ; selected by r5.
    movddup         m7, [base+z_filter_k+8*2+r5*8+24*0]
    punpcklbw       m0, m1
    pmaddubsw       m0, m7
    movq            m1, [rsp+gprsize+16*8+0]
    movq            m2, [rsp+gprsize+16*8+1]
    movddup         m7, [base+z_filter_k+8*2+r5*8+24*1]
    punpcklbw       m1, m2
    pmaddubsw       m1, m7
    movq            m2, [rsp+gprsize+16*8+2]
    movddup         m7, [base+z_filter_k+8*2+r5*8+24*2]
    punpcklbw       m2, m2
    pmaddubsw       m2, m7
    paddw           m0, m1
    paddw           m0, m2
%if ARCH_X86_64
    mov             r3d, r7m ; maxw, offset due to call
%else
    mov             r3d, [rsp+gprsize+16*18+4*3] ; maxwm slot, shifted by gprsize
%endif
    pmulhrsw        m0, m10                     ; rounded >> 6
    pmulhrsw        m1, m10                     ; only the low 8 packed bytes (from m0) are stored
    packuswb        m0, m1
    movq            [rsp+gprsize+16*8], m0
    cmp             r3d, 8
    jge             .w8_filter_top_end
    ; maxw < 8: from position maxw onward put the unfiltered source pixels
    ; back over the filtered ones.
    movq            m0, [tlq+r3+1]
    movq            [rsp+gprsize+r3+16*8], m0
.w8_filter_top_end:
    ret
.w16:
    test            angled, 0x400
    jnz             .w4_main                    ; no edge preprocessing
    ; Same strength computation as .w8_filter_top, inlined: m1 becomes the
    ; top-edge mask and m6 the left-edge mask.
    lea             r3d, [hq+15]
    sub             angled, 90
    movd            m0, r3d
    mov             r3d, 90
    movd            m1, angled
    sub             r3d, angled ; 180 - angle
    shr             angled, 8 ; is_sm << 1
    movd            m6, r3d
    REPX {pshufb x, m7}, m0, m1, m6
    movq            m3, [base+z_filter_t_w16+angleq*4]
    pcmpeqb         m0, [base+z_filter_wh16]
    pand            m1, m0
    pand            m6, m0
    pcmpgtb         m1, m3
    pcmpgtb         m6, m3
    ; ---- continuation of .w16: filter the 16 top-edge pixels at rsp+16*8 ----
    ; On entry here m1 = top-edge comparison mask, m6 = left-edge mask,
    ; m5 = tl[1..16] (loaded in the prologue).
    pmovmskb        r5d, m1
    mov             r3, tlq                     ; keep the original edge pointer
    test            r5d, r5d
    jz              .w16_filter_left ; filter_strength == 0
    imul            r5d, 0x24924924
    pshufb          m5, [base+z_filter_t_w16] ; tlq[16]
    shr             r5d, 30
    adc             r5, -4 ; filter_strength-3     (adc also adds the bit shifted out by shr)
    movd            [rsp+16*9], m5              ; padding bytes right after the 16 pixels
    movddup         m7, [base+z_filter_k+8*2+r5*8+24*0]
    movu            m1, [rsp+16*8-2]
    movu            m2, [rsp+16*8-1]
    punpcklbw       m0, m1, m2                  ; pairs (p[-2], p[-1]), low half
    pmaddubsw       m0, m7
    punpckhbw       m1, m2                      ; same pairs, high half
    pmaddubsw       m1, m7
    movddup         m7, [base+z_filter_k+8*2+r5*8+24*1]
    mova            m3, [rsp+16*8+0]
    movu            m4, [rsp+16*8+1]
    punpcklbw       m2, m3, m4                  ; pairs (p[0], p[+1])
    pmaddubsw       m2, m7
    punpckhbw       m3, m4
    pmaddubsw       m3, m7
    paddw           m0, m2
    paddw           m1, m3
    test            r5d, r5d
    jnz             .w16_filter_end ; 3-tap
    ; r5 == 0 (strength 3): add the contribution of p[+2] as well.
    movddup         m7, [base+z_filter_k+8*8]
    movu            m3, [rsp+16*8+2]
    punpcklbw       m2, m3, m3
    pmaddubsw       m2, m7
    punpckhbw       m3, m3
    pmaddubsw       m3, m7
    paddw           m0, m2
    paddw           m1, m3
.w16_filter_end:
    mov             r2d, maxwm
    pmulhrsw        m0, m10                     ; m10 = pw_512: rounded >> 6
    pmulhrsw        m1, m10
    packuswb        m0, m1
    mova            [rsp+16*8], m0
    cmp             r2d, 16
    jge             .w16_filter_left
    ; maxw < 16: from position maxw onward restore the unfiltered pixels from
    ; the original edge (r3 = caller's tl).
    movu            m0, [r3+r2+1]
    movu            [rsp+r2+16*8], m0
.w16_filter_left:
    pmovmskb        r5d, m6
    test            r5d, r5d
    jz              .w4_main                    ; left edge needs no filtering
    imul            r5d, 0x24924924
    shr             r5d, 30
    adc             r5, -4 ; filter_strength-3
    jmp             .filter_left
.w32:
    test            angled, 0x400
    jnz             .w4_main                    ; no edge preprocessing
    ; m6 still holds tl[17..32] from the prologue.
    pshufb          m6, [base+z_filter_t_w16] ; tlq[32]
    mov             r3, tlq                     ; keep the original edge pointer
    lea             tlq, [rsp+16*9]             ; point the shared filter at the stack copy
    movd            [tlq+16*1], m6              ; padding after the 32 top pixels
    xor             r5d, r5d ; filter_strength = 3
    ; Reuse the edge filter of the z1 predictor; its visible tail writes 32
    ; filtered bytes to [tlq+16*0] and [tlq+16*1].
    call            mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    mova            m0, [tlq+16*0]
    mova            m1, [tlq+16*1]
    mov             r2d, maxwm
    mova            [rsp+16*8], m0              ; move the result to the top-edge slots
    mova            [rsp+16*9], m1
    cmp             r2d, 32
    jge             .filter_left
    ; maxw < 32: restore unfiltered pixels from position maxw onward.
    movu            m0, [r3+r2+16*0+1]
    movu            m1, [r3+r2+16*1+1]
    movu            [rsp+r2+16*8], m0
    movu            [rsp+r2+16*9], m1
    jmp             .filter_left
.w64:
    ; The prologue copied only tl[1..32]; add tl[33..64] before anything else
    ; so that the unfiltered path (.w4_main) sees the full top edge too.
    movu            m0, [tlq+16*2+1]
    movu            m1, [tlq+16*3+1]
    mova            [rsp+16*10], m0
    mova            [rsp+16*11], m1
    test            angled, 0x400
    jnz             .w4_main                    ; no edge preprocessing
    pshufb          m1, [base+z_filter_t_w16] ; tlq[64]
    mov             r3, tlq
    lea             tlq, [rsp+16*11]
    movd            [tlq+16*1], m1              ; padding after the 64 top pixels
    xor             r5d, r5d ; filter_strength = 3
    ; Two passes of the shared filter, upper 32 pixels first, then the lower
    ; 32 (tlq moved back by 32 bytes in between).
    call            mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub             tlq, 16*2
    call            mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    mova            m0, [tlq+16*0]
    mova            m1, [tlq+16*1]
    mova            m2, [tlq+16*2]
    mova            m3, [tlq+16*3]
    mov             r2d, maxwm
    mova            [rsp+16* 8], m0             ; move the 64 filtered bytes to the top-edge slots
    mova            [rsp+16* 9], m1
    mova            [rsp+16*10], m2
    mova            [rsp+16*11], m3
    cmp             r2d, 64
    jge             .filter_left
    ; maxw < 64: restore 32 unfiltered pixels starting at maxw ...
    movu            m0, [r3+r2+16*0+1]
    movu            m1, [r3+r2+16*1+1]
    movu            [rsp+r2+16* 8], m0
    movu            [rsp+r2+16* 9], m1
    cmp             r2d, 32
    jge             .filter_left
    ; ... and another 32 when maxw < 32.
    movu            m0, [r3+r2+16*2+1]
    movu            m1, [r3+r2+16*3+1]
    movu            [rsp+r2+16*10], m0
    movu            [rsp+r2+16*11], m1
; Left-edge filtering shared by all widths. In: r3 = original tl pointer,
; r5 = filter_strength-3, hq = h. (This routine continues past the end of
; this chunk; only its first instructions are visible here.)
.filter_left:
    neg             hq
    movd            m0, [r3+hq]                 ; tl[-h], the last left pixel
    pxor            m1, m1
    pshufb          m0, m1                      ; broadcast it
    movd            [rsp+16*6+hq-4], m0         ; pad the 4 bytes beyond the left edge
    lea             tlq, [rsp+16*5]
    call            mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
Coastguard Worker cmp hd, -32 2587*c0909341SAndroid Build Coastguard Worker jge .filter_left_end 2588*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 2589*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2590*c0909341SAndroid Build Coastguard Worker mova m0, [tlq+16*0] 2591*c0909341SAndroid Build Coastguard Worker mova m1, [tlq+16*1] 2592*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m0 2593*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m1 2594*c0909341SAndroid Build Coastguard Worker.filter_left_end: 2595*c0909341SAndroid Build Coastguard Worker mov r2d, maxhm 2596*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+16*5] 2597*c0909341SAndroid Build Coastguard Worker mova m1, [rsp+16*6] 2598*c0909341SAndroid Build Coastguard Worker mova m2, [rsp+16*7] 2599*c0909341SAndroid Build Coastguard Worker neg r2 2600*c0909341SAndroid Build Coastguard Worker mova [rsp+16*4], m0 2601*c0909341SAndroid Build Coastguard Worker mova [rsp+16*5], m1 2602*c0909341SAndroid Build Coastguard Worker mova [rsp+16*6], m2 2603*c0909341SAndroid Build Coastguard Worker cmp r2d, hd 2604*c0909341SAndroid Build Coastguard Worker jle .w4_main 2605*c0909341SAndroid Build Coastguard Worker movu m0, [r3+r2-16*2] 2606*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2-16*1] 2607*c0909341SAndroid Build Coastguard Worker movu [rsp+r2+16*4], m0 2608*c0909341SAndroid Build Coastguard Worker movu [rsp+r2+16*5], m1 2609*c0909341SAndroid Build Coastguard Worker cmp r2d, -32 2610*c0909341SAndroid Build Coastguard Worker jle .w4_main 2611*c0909341SAndroid Build Coastguard Worker movu m0, [r3+r2-16*4] 2612*c0909341SAndroid Build Coastguard Worker movu m1, [r3+r2-16*3] 2613*c0909341SAndroid Build Coastguard Worker movu [rsp+r2+16*2], m0 2614*c0909341SAndroid Build Coastguard Worker movu [rsp+r2+16*3], m1 2615*c0909341SAndroid Build Coastguard Worker jmp .w4_main 2616*c0909341SAndroid Build Coastguard Worker 
; -----------------------------------------------------------------------------
; ipred_z3_8bpc
;
; Named arguments (from the cglobal line): dst, stride, tl, w, h, angle.
; dy and org_w are extra register names declared on the same line.
;
; Visible flow:
;   1. Load constants, derive dy from `angle` through dr_intra_derivative and
;      jump through ipred_z3_ssse3_table, indexed by tzcnt(h).
;   2. Each .hN path optionally upsamples or filters the edge read through tl
;      (copied to stack scratch space), then interpolates the block one or two
;      columns at a time into a buffer carved out below rsp (`sub rsp, wq`).
;   3. A transpose stage copies that buffer to dst. The .h4 transpose and the
;      w == 4 case of .h8 are in this excerpt; the wider cases jump to
;      .end_transpose_* labels that lie outside it.
;
; Interpolation idiom shared by all loops below:
;   frac    = pos & m8                    (m8 = pw_62)
;   weights = one byte holding (m9 - frac) and one holding frac, packed into
;             each 16-bit lane (m9 = pw_64)
;   pmaddubsw pixel pairs by the weights, then pmulhrsw by m10 (pw_512)
;   -- presumably a rounded >>6 if the pw_* tables hold the values their names
;      suggest; TODO confirm against the RODATA section.
; -----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
    %define            base  r7-$$
    lea                  r7, [$$]
    ; x86-64: keep the three word constants resident in registers.
    mova                 m8, [base+pw_62]
    mova                 m9, [base+pw_64]
    mova                m10, [base+pw_512]
    ; w is consumed as a loop counter below; keep a copy for the transpose.
    mov              org_wd, wd
%else
cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
    %define            base  r1-$$
    ; x86-32: the same constants are used as memory operands instead.
    %define              m8  [base+pw_62]
    %define              m9  [base+pw_64]
    %define             m10  [base+pw_512]
    %define          org_wd  r5
    %define          org_wq  r5
    ; Stash stride and w inside the destination buffer (row 0 and row 1);
    ; they are read back at the .hN_transpose labels. base is r1 (the stride
    ; register), so stride is overwritten by the LEA below.
    mov    [dstq+strideq*0], strideq
    mov    [dstq+strideq*1], wd
    LEA                  r1, $$
%endif
    ; h <- tzcnt(h): index into the jump table.
    tzcnt                hd, hm
    movifnidn        angled, anglem
    ; All edge loads below are relative to tl-1.
    dec                 tlq
    ; Table entries are 32-bit offsets relative to the table itself
    ; (see the lea before `jmp hq`).
    movsxd               hq, [base+ipred_z3_ssse3_table+hq*4]
    sub              angled, 180
    mov                 dyd, angled
    neg                 dyd
    ; Flip bit 0x400; afterwards a set bit means "!enable_intra_edge_filter"
    ; (per the comments on the later `test angled, 0x400` lines).
    xor              angled, 0x400
    ; Keep bits 1..6 of dy and force every other bit to 1 to form the
    ; (negative) byte offset used in the table lookup below.
    or                  dyq, ~0x7e
    lea                  hq, [base+ipred_z3_ssse3_table+hq]
    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
    jmp                  hq

; ---- h == 4 (first visible jump target) --------------------------------------
.h4:
    lea                 r4d, [angleq+88]
    test                r4d, 0x480
    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar                 r4d, 9
    add                 r4d, wd
    cmp                 r4d, 8
    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
    ; -- upsample path: build a 2x-resolution edge on the stack --
    movu                 m3, [tlq-7]
    movu                 m1, [base+z_upsample1-4]
    movu                 m4, [base+z_filter_s+2]
    pshufb               m0, m3, m1
    pxor                 m1, m1
    ; Broadcast byte 0 of the loaded edge (all-zero shuffle mask).
    pshufb               m2, m3, m1
    pshufb               m1, m3, m4
    mova           [rsp+16], m2 ; top[max_base_y]
    movddup              m2, [base+pb_36_m4]
    ; dy is doubled because the edge now has twice the resolution.
    add                 dyd, dyd
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    movd                 m5, dyd
    ; r5 = running position for the first column.
    mov                 r5d, dyd
    pshufb               m5, [base+pw_256]
    paddw                m0, m1
    pmulhrsw             m0, m10
    ; w*4 = size in bytes of the column buffer (4 pixels per column).
    shl                  wd, 2
    ; tl now points at stack scratch (old rsp); the column buffer is
    ; allocated below it by lowering rsp.
    mov                 tlq, rsp
    sub                 rsp, wq
    packuswb             m0, m0
    ; Interleave the filtered samples with the original edge pixels.
    punpcklbw            m0, m3
    ; m6 = 2*dy per lane (step for two columns); m5 = {dy, 2*dy} positions.
    paddw                m6, m5, m5
    punpcklqdq           m5, m6
    pshufb               m0, [base+pb_15to0]
    mova              [tlq], m0
.h4_upsample_loop:
    ; Two columns per iteration: r5 and r4 are consecutive positions, and
    ; pos >> 6 is the integer offset into the upsampled edge.
    lea                 r4d, [r5+dyq]
    shr                 r5d, 6
    movq                 m0, [tlq+r5]
    lea                 r5d, [r4+dyq]
    shr                 r4d, 6
    movhps               m0, [tlq+r4]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    por                  m1, m2
    pmaddubsw            m0, m1
    paddw                m5, m6
    pmulhrsw             m0, m10
    packuswb             m0, m0
    ; The buffer is filled from its top end downwards.
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jg .h4_upsample_loop
    jmp .h4_transpose
.h4_no_upsample:
    ; r4 = offset subtracted from tl in .h4_main when no edge filtering occurs.
    mov                 r4d, 7
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h4_main
    ; -- derive filter_strength from (w+3, angle, is_sm) via table compares --
    lea                 r4d, [wq+3]
    movd                 m0, r4d
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    ; Broadcast the low byte of w+3 and of angle to every lane.
    pshufb               m0, m1
    pshufb               m2, m1
    pcmpeqb              m1, m0, [base+z_filter_wh4]
    pand                 m1, m2
    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r4d, 7
    test                r5d, r5d
    jz .h4_main ; filter_strength == 0
    movu                 m2, [tlq-7]
    ; The multiply + shift below folds the compare mask into a small integer.
    imul                r5d, 0x55555555
    movu                 m3, [base+z_filter_s-2]
    shr                 r5d, 30 ; filter_strength
    mova                 m4, [base+z_upsample2]
    ; Three coefficient rows, 24 bytes apart, selected by filter_strength.
    movddup              m5, [base+z_filter_k-8+r5*8+24*0]
    movddup              m6, [base+z_filter_k-8+r5*8+24*1]
    movddup              m7, [base+z_filter_k-8+r5*8+24*2]
    pshufb               m0, m2, m3
    shufps               m3, m4, q2121
    pmaddubsw            m1, m0, m5
    pmaddubsw            m0, m6
    pshufb               m5, m2, m3
    pmaddubsw            m3, m5, m6
    pmaddubsw            m5, m7
    pshufb               m2, m4
    pmaddubsw            m2, m7
    paddw                m0, m1
    paddw                m1, m3
    paddw                m0, m5
    paddw                m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    ; r4 becomes 9 instead of 7 when w != 4.
    lea                 r2d, [r4+2]
    cmp                  wd, 4
    cmovne              r4d, r2d
    pshufd               m0, m0, q0000
    ; The filtered edge lives in the 16 bytes at rsp; tl = its last byte.
    lea                 tlq, [rsp+15]
    packuswb             m0, m1
    mova              [rsp], m0
.h4_main:
    movd                 m5, dyd
    movddup              m0, [base+z_base_inc] ; base_inc << 6
    ; tl -> the edge pixel r4 bytes below; it is loaded into m7 and used as
    ; the fill value wherever a lane fails the max_base_y test.
    sub                 tlq, r4
    shl                 r4d, 6
    movd                 m7, [tlq]
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    ; From here on dy is negative: positions count down towards zero.
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf_h4]
    lea                  r5, [dyq+r4+63] ; ypos
    pshufb               m4, [base+pw_256]
    psubw                m4, m0 ; max_base_y
    ; w*4 = size in bytes of the column buffer.
    shl                  wd, 2
    paddw                m6, m5, m5
    sub                 rsp, wq
    punpcklqdq           m5, m6
.h4_loop:
    ; Two columns per iteration (low and high qword of m0).
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movq                 m0, [tlq+r5-4]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movhps               m0, [tlq+r4-4]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    pshufb               m0, m3
    por                  m1, m2
    pmaddubsw            m0, m1
    ; Lane mask: max_base_y > position -> keep the interpolated value,
    ; otherwise substitute the fill pixel in m7.
    pcmpgtw              m1, m4, m5
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h4_transpose
    ; Keep interpolating only while the running position is still positive.
    test                r5d, r5d
    jg .h4_loop
    packuswb             m7, m7
.h4_end_loop:
    ; Remaining columns are filled entirely with the fill pixel.
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h4_end_loop
.h4_transpose:
    mova                 m1, [base+z_transpose4]
%if ARCH_X86_32
    ; Read back the stride and w values stashed in dst at function entry.
    mov             strideq, [dstq]
    mov              org_wd, [dstq+strideq]
%endif
    lea                  r2, [strideq*3]
    ; Start at the right-most group of 4 output columns and walk left.
    lea                dstq, [dstq+org_wq-4]
.h4_transpose_loop:
    ; Consume 16 bytes of the column buffer per iteration; `add rsp, 16`
    ; releases it as it is consumed.
    mova                 m0, [rsp]
    add                 rsp, 16
    pshufb               m0, m1
    ; One dword (4 pixels) per output row.
    movd   [dstq+strideq*0], m0
    pshuflw              m2, m0, q1032
    movd   [dstq+strideq*1], m2
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    sub                dstq, 4
    sub              org_wd, 4
    jg .h4_transpose_loop
    RET

; ---- h == 8 ------------------------------------------------------------------
.h8:
    lea                 r4d, [angleq+88]
    and                 r4d, ~0x7f
    or                  r4d, wd
    cmp                 r4d, 8
    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    ; -- upsample path --
    mova                 m4, [tlq-15]
    and                 r4d, 4
    movu                 m3, [tlq- 9]
    movd                 m1, r4d
    movu                 m2, [base+z_filter_s+2]
    pxor                 m0, m0
    movu                 m5, [base+z_filter_s+6]
    movddup              m7, [base+pb_36_m4]
    pshufb               m1, m0 ; w & 4
    movu                 m0, [base+z_upsample1-4]
    pmaxub               m1, m0 ; clip 4x8
    ; dy is doubled for the 2x-resolution edge.
    add                 dyd, dyd
    pshufb               m0, m4, m1
    pmaddubsw            m0, m7
    pshufb               m1, m4, m2
    pmaddubsw            m1, m7
    pshufb               m2, m3, [base+z_upsample1]
    pmaddubsw            m2, m7
    pshufb               m3, m5
    pmaddubsw            m3, m7
    movd                 m5, dyd
    neg                 dyq
    paddw                m1, m0
    paddw                m2, m3
    pmulhrsw             m1, m10
    pmulhrsw             m2, m10
    ; w*8 = size in bytes of the column buffer (8 pixels per column).
    shl                  wd, 3
    ; tl -> stack scratch; the upsampled edge occupies [tl-16, tl+16).
    lea                 tlq, [rsp+16]
    pshufb               m5, [base+pw_256]
    sub                 rsp, wq
    packuswb             m1, m2
    lea                  r5, [dyq+63]
    ; Interleave filtered samples with the original edge pixels.
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    mova       [tlq-16*1], m0
    mova       [tlq-16*0], m1
    paddw                m6, m5, m5
    punpcklqdq           m5, m6
.h8_upsample_loop:
    ; Two 8-pixel columns per iteration.
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movu                 m0, [tlq+r5]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movu                 m1, [tlq+r4]
    ; Note the weight bytes are packed the other way round from the .h4
    ; loops here: (m9 - frac) goes in the high byte, frac in the low byte.
    pand                 m3, m8, m5
    psubw                m2, m9, m3
    psllw                m2, 8
    por                  m3, m2
    ; Low qword of m3 = weights for the first column, high qword = second.
    pshufd               m2, m3, q1010
    pmaddubsw            m0, m2
    punpckhqdq           m3, m3
    pmaddubsw            m1, m3
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m1, m0
    mova       [rsp+wq-16], m1
    sub                  wd, 16
    jg .h8_upsample_loop
    jmp .h8_transpose
.h8_no_upsample:
    lea                 r4d, [wq+7]
    movd                 m0, r4d
    and                 r4d, 7
    or                  r4d, 8 ; imin(w+7, 15)
    test             angled, 0x400
    jnz .h8_main
    ; -- derive filter_strength (same scheme as .h4, with the h8 tables) --
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movu                 m1, [base+z_filter_wh8]
    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
    pcmpeqb              m1, m0
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h8_main ; filter_strength == 0
    ; Copy the edge to stack scratch, padding both ends with a replicated
    ; pixel, then hand it to the edge filter defined in ipred_z1.
    mova                 m0, [tlq-15]
    imul                r5d, 0x55555555
    movd                 m1, [tlq+1]
    neg                  r4
    movd                 m2, [tlq+r4]
    shr                 r5d, 30
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    sub                  r5, 3 ; filter_strength-3
    mova         [tlq+16*0], m0
    pshufb               m1, m7
    mova         [tlq+16*1], m1
    pshufb               m2, m7
    movq         [tlq+r4+8], m2
    ; Restore r4 to its positive value.
    neg                 r4d
    ; NOTE(review): filter_edge's register contract (inputs tlq/r5, which
    ; registers it preserves) is defined in ipred_z1 and is not visible here.
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sar                 r5d, 1
    add                 tlq, 31
    add                 r5d, 17
    ; For w > 8, r4 is replaced by a value derived from filter_strength.
    cmp                  wd, 8
    cmova               r4d, r5d
.h8_main:
    movd                 m5, dyd
    ; tl -> the pixel r4 bytes below; loaded into m7 as the fill value.
    sub                 tlq, r4
    shl                 r4d, 6
    movd                 m7, [tlq]
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, [base+pw_256]
    ; Per-lane limit, compared against the running position in the loop.
    psubw                m4, [base+z3_base_inc]
    ; w*8 = size in bytes of the column buffer.
    shl                  wd, 3
    ; One column per iteration, so the position step is dy itself.
    mova                 m6, m5
    sub                 rsp, wq
.h8_loop:
    mov                  r4, r5
    sar                  r4, 6
    movu                 m0, [tlq+r4-8]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    pshufb               m0, m3
    por                  m1, m2
    pmaddubsw            m0, m1
    ; Lane mask: limit > position -> interpolated value, else fill pixel.
    pcmpgtw              m1, m4, m5
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h8_transpose
    ; The flags from this add drive the jg: loop while the position is > 0.
    add                  r5, dyq
    jg .h8_loop
    packuswb             m7, m7
.h8_end_loop:
    ; Remaining columns are filled entirely with the fill pixel.
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h8_end_loop
.h8_transpose:
%if ARCH_X86_32
    ; Read back the stride and w values stashed in dst at function entry.
    mov             strideq, [dstq]
    mov              org_wd, [dstq+strideq]
%endif
    ; NOTE(review): r3d (w) is presumably read as the block height by the
    ; shared .end_transpose_* code, which is outside this excerpt -- confirm.
    or                  r3d, 8
    cmp              org_wd, 4
%if ARCH_X86_64
    jne .end_transpose_main
%else
    jne .end_transpose_loop
%endif
    ; w == 4: the whole block is 32 bytes; interleave and write it directly.
    mova                 m1, [rsp+16*1]
    mova                 m0, [rsp+16*0]
    lea                  r2, [strideq*3]
    add                 rsp, 16*2
    punpcklbw            m2, m1, m0
    punpckhbw            m1, m0
    punpckhbw            m0, m1, m2
    punpcklbw            m1, m2
.write_4x8_end:
    call .write_4x8
    RET

; Local subroutine: store a 4-wide, 8-row block.
;   in:  m0 = rows 0-3, m1 = rows 4-7 (one dword per row; dword 0 goes to
;        the last row of its group), dstq, strideq, r2 = stride*3
;   out: dstq advanced by 4 rows; m0, m1 and m4 are overwritten.
.write_4x8:
    movd   [dstq+r2       ], m0
    pshuflw              m4, m0, q1032
    movd   [dstq+strideq*2], m4
    punpckhqdq           m0, m0
    movd   [dstq+strideq*1], m0
    psrlq                m0, 32
    movd   [dstq+strideq*0], m0
    lea                dstq, [dstq+strideq*4]
    movd   [dstq+r2       ], m1
    pshuflw              m4, m1, q1032
    movd   [dstq+strideq*2], m4
    punpckhqdq           m1, m1
    movd   [dstq+strideq*1], m1
    psrlq                m1, 32
    movd   [dstq+strideq*0], m1
    ret

; ---- h == 16 (continues beyond this excerpt) ---------------------------------
.h16:
    lea                 r4d, [wq+15]
    movd                 m0, r4d
    and                 r4d, 15
    or                  r4d, 16 ; imin(w+15, 31)
    test             angled, 0x400
    jnz .h16_main
    ; -- derive filter_strength with the h16 tables --
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movq                 m3, [base+z_filter_t_w16+angleq*4]
    pcmpeqb              m1, m0, [base+z_filter_wh16]
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h16_main ; filter_strength == 0
    ; Copy 32 edge bytes plus replicated padding to stack scratch.
    mova                 m0, [tlq-16*2+1]
    imul                r5d, 0x24924924
    mova                 m1, [tlq-16*1+1]
    neg                  r4
    movd                 m2, [tlq-16*0+1]
    ; The carry produced by this shr is consumed by the adc two
    ; instructions below; the movd in between does not touch the flags.
    shr                 r5d, 30
    movd                 m3, [tlq+r4]
    adc                  r5, -4 ; filter_strength-3
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    mova       [tlq-16*1], m0
    pshufb               m2, m7
    mova       [tlq+16*0], m1
    pshufb               m3, m7
    mova       [tlq+16*1], m2
    movq       [tlq+r4+8], m3
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    add                 tlq, 31
    cmp                  wd, 16
    jle .h16_main
    ; w > 16: filter one extra tail group with z3_filter_k_tail and move
    ; the starting offset to r5+33.
    pshuflw              m0, [tlq-47], q0000
    sar                  r5, 1
    movq                 m1, [base+z3_filter_k_tail+r5*4]
    lea                 r4d, [r5+33]
    pmaddubsw            m0, m1
%if ARCH_X86_64
    pmulhrsw             m0, m10
%else
    ; NOTE(review): the x86-32 build multiplies by m4 here instead of m10,
    ; so m4 presumably still holds the rounding constant left there by
    ; filter_edge -- confirm against ipred_z1's filter_edge.
    pmulhrsw             m0, m4
%endif
    packuswb             m0, m0
    movd           [tlq-35], m0
.h16_main:
    movd                 m5, dyd
    sub                 tlq, r4
    ; The limit is tracked per byte here (m4 is built from r4 before the
    ; shift), unlike the per-word compares in .h4_main and .h8_main.
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    ; Broadcast the fill pixel to all 16 bytes.
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]
    ; w*16 = size in bytes of the column buffer (16 pixels per column).
    shl                  wd, 4
    mova                 m6, m5
    sub                 rsp, wq
.h16_loop:
    ; One 16-pixel column per iteration, computed as two 8-pixel halves.
    mov                  r4, r5
    pand                 m2, m8, m5
    sar                  r4, 6
    psubw                m1, m9, m2
    psllw                m2, 8
    movu                 m0, [tlq+r4-8*2]
    por                  m2, m1
    movu                 m1, [tlq+r4-8*1]
    pshufb               m0, m3
    pmaddubsw            m0, m2
    pshufb               m1, m3
    pmaddubsw            m1, m2
    ; Integer part of the position, packed to bytes for the limit compare.
    psrlw                m2, m5, 6
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packsswb             m2, m2
    packuswb             m0, m1
    ; Byte mask: limit > base -> interpolated value, else fill pixel.
    pcmpgtb              m1, m4, m2
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    mova       [rsp+wq-16], m0
    sub                  wd, 16
    jz .h16_transpose
    ; The flags from this add drive the jg: loop while the position is > 0.
    add                  r5, dyq
    jg .h16_loop
.h16_end_loop:
    ; Remaining columns are filled entirely with the fill pixel.
    mova       [rsp+wq-16], m7
    sub                  wd, 16
    jg .h16_end_loop
.h16_transpose:
%if ARCH_X86_32
Coastguard Worker mov strideq, [dstq] 3116*c0909341SAndroid Build Coastguard Worker mov org_wd, [dstq+strideq] 3117*c0909341SAndroid Build Coastguard Worker%endif 3118*c0909341SAndroid Build Coastguard Worker or r3d, 16 3119*c0909341SAndroid Build Coastguard Worker cmp org_wd, 4 3120*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3121*c0909341SAndroid Build Coastguard Worker jne .end_transpose_main 3122*c0909341SAndroid Build Coastguard Worker%else 3123*c0909341SAndroid Build Coastguard Worker jne .end_transpose_loop 3124*c0909341SAndroid Build Coastguard Worker%endif 3125*c0909341SAndroid Build Coastguard Worker.h16_transpose_w4: 3126*c0909341SAndroid Build Coastguard Worker mova m2, [rsp+16*3] 3127*c0909341SAndroid Build Coastguard Worker mova m4, [rsp+16*2] 3128*c0909341SAndroid Build Coastguard Worker mova m3, [rsp+16*1] 3129*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+16*0] 3130*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 3131*c0909341SAndroid Build Coastguard Worker add rsp, 16*4 3132*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m4 3133*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 3134*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m3, m0 3135*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m0 3136*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1, m4 3137*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4 3138*c0909341SAndroid Build Coastguard Worker call .write_4x8 3139*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 3140*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m2, m3 3141*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 3142*c0909341SAndroid Build Coastguard Worker jmp .write_4x8_end 3143*c0909341SAndroid Build Coastguard Worker.h32: 3144*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+31] 3145*c0909341SAndroid Build Coastguard Worker and r4d, 31 3146*c0909341SAndroid Build Coastguard Worker or r4d, 32 ; 
imin(w+31, 63) 3147*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 3148*c0909341SAndroid Build Coastguard Worker jnz .h32_main 3149*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16*4+1] 3150*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-16*3+1] 3151*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16*2+1] 3152*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*1+1] 3153*c0909341SAndroid Build Coastguard Worker movd m4, [tlq-16*0+1] 3154*c0909341SAndroid Build Coastguard Worker neg r4 3155*c0909341SAndroid Build Coastguard Worker movd m5, [tlq+r4] 3156*c0909341SAndroid Build Coastguard Worker pxor m7, m7 3157*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*4] 3158*c0909341SAndroid Build Coastguard Worker mova [tlq-16*3], m0 3159*c0909341SAndroid Build Coastguard Worker mova [tlq-16*2], m1 3160*c0909341SAndroid Build Coastguard Worker xor r5d, r5d ; filter_strength = 3 3161*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m2 3162*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 3163*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m3 3164*c0909341SAndroid Build Coastguard Worker pshufb m5, m7 3165*c0909341SAndroid Build Coastguard Worker mova [tlq+16*1], m4 3166*c0909341SAndroid Build Coastguard Worker movq [tlq+r4+8], m5 3167*c0909341SAndroid Build Coastguard Worker neg r4d 3168*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3169*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 3170*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3171*c0909341SAndroid Build Coastguard Worker add tlq, 63 3172*c0909341SAndroid Build Coastguard Worker cmp wd, 32 3173*c0909341SAndroid Build Coastguard Worker jle .h32_main 3174*c0909341SAndroid Build Coastguard Worker pshuflw m0, [tlq-79], q0000 3175*c0909341SAndroid Build Coastguard Worker movq m1, 
[base+z3_filter_k_tail] 3176*c0909341SAndroid Build Coastguard Worker add r4d, 2 3177*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 3178*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3179*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3180*c0909341SAndroid Build Coastguard Worker%else 3181*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 3182*c0909341SAndroid Build Coastguard Worker%endif 3183*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 3184*c0909341SAndroid Build Coastguard Worker movd [tlq-67], m0 3185*c0909341SAndroid Build Coastguard Worker.h32_main: 3186*c0909341SAndroid Build Coastguard Worker movd m5, dyd 3187*c0909341SAndroid Build Coastguard Worker sub tlq, r4 3188*c0909341SAndroid Build Coastguard Worker movd m4, r4d 3189*c0909341SAndroid Build Coastguard Worker shl r4d, 6 3190*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 3191*c0909341SAndroid Build Coastguard Worker pxor m6, m6 3192*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 3193*c0909341SAndroid Build Coastguard Worker neg dyq 3194*c0909341SAndroid Build Coastguard Worker pshufb m7, m6 3195*c0909341SAndroid Build Coastguard Worker mova m3, [base+z3_shuf] 3196*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] 3197*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 3198*c0909341SAndroid Build Coastguard Worker psubb m4, [base+pb_15to0] 3199*c0909341SAndroid Build Coastguard Worker mova m6, m5 3200*c0909341SAndroid Build Coastguard Worker.h32_loop: 3201*c0909341SAndroid Build Coastguard Worker mov r4, r5 3202*c0909341SAndroid Build Coastguard Worker pand m2, m8, m5 3203*c0909341SAndroid Build Coastguard Worker sar r4, 6 3204*c0909341SAndroid Build Coastguard Worker psubw m1, m9, m2 3205*c0909341SAndroid Build Coastguard Worker psllw m2, 8 3206*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*4] 3207*c0909341SAndroid Build Coastguard Worker por m2, m1 3208*c0909341SAndroid Build Coastguard 
Worker movu m1, [tlq+r4-8*3] 3209*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 3210*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3211*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3212*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3213*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3214*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3215*c0909341SAndroid Build Coastguard Worker sub rsp, 32 3216*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3217*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 3218*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*2] 3219*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4-8*1] 3220*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 3221*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3222*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3223*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3224*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3225*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3226*c0909341SAndroid Build Coastguard Worker psrlw m2, m5, 6 3227*c0909341SAndroid Build Coastguard Worker paddw m5, m6 3228*c0909341SAndroid Build Coastguard Worker packsswb m2, m2 3229*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3230*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3231*c0909341SAndroid Build Coastguard Worker paddsb m2, [base+pb_16] 3232*c0909341SAndroid Build Coastguard Worker pand m0, m1 3233*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3234*c0909341SAndroid Build Coastguard Worker por m0, m1 3235*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3236*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 3237*c0909341SAndroid Build Coastguard Worker pand m0, m1, [rsp+16*0] 3238*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3239*c0909341SAndroid Build Coastguard Worker por m0, m1 3240*c0909341SAndroid Build 
Coastguard Worker mova [rsp+16*0], m0 3241*c0909341SAndroid Build Coastguard Worker dec wd 3242*c0909341SAndroid Build Coastguard Worker jz .h32_transpose 3243*c0909341SAndroid Build Coastguard Worker add r5, dyq 3244*c0909341SAndroid Build Coastguard Worker jg .h32_loop 3245*c0909341SAndroid Build Coastguard Worker.h32_end_loop: 3246*c0909341SAndroid Build Coastguard Worker sub rsp, 32 3247*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m7 3248*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m7 3249*c0909341SAndroid Build Coastguard Worker dec wd 3250*c0909341SAndroid Build Coastguard Worker jg .h32_end_loop 3251*c0909341SAndroid Build Coastguard Worker.h32_transpose: 3252*c0909341SAndroid Build Coastguard Worker or r3d, 32 3253*c0909341SAndroid Build Coastguard Worker jmp .end_transpose_main 3254*c0909341SAndroid Build Coastguard Worker.h64: 3255*c0909341SAndroid Build Coastguard Worker lea r4d, [wq+63] 3256*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 3257*c0909341SAndroid Build Coastguard Worker jnz .h64_main 3258*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16*8+1] 3259*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-16*7+1] 3260*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16*6+1] 3261*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*5+1] 3262*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 3263*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m1 3264*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m2 3265*c0909341SAndroid Build Coastguard Worker mova [rsp+16*4], m3 3266*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16*4+1] 3267*c0909341SAndroid Build Coastguard Worker mova m1, [tlq-16*3+1] 3268*c0909341SAndroid Build Coastguard Worker mova m2, [tlq-16*2+1] 3269*c0909341SAndroid Build Coastguard Worker mova m3, [tlq-16*1+1] 3270*c0909341SAndroid Build Coastguard Worker movd m4, [tlq-16*0+1] 3271*c0909341SAndroid Build 
Coastguard Worker neg r4 3272*c0909341SAndroid Build Coastguard Worker movd m5, [tlq+r4] 3273*c0909341SAndroid Build Coastguard Worker pxor m7, m7 3274*c0909341SAndroid Build Coastguard Worker lea tlq, [rsp+16*8] 3275*c0909341SAndroid Build Coastguard Worker mova [tlq-16*3], m0 3276*c0909341SAndroid Build Coastguard Worker mova [tlq-16*2], m1 3277*c0909341SAndroid Build Coastguard Worker xor r5d, r5d ; filter_strength = 3 3278*c0909341SAndroid Build Coastguard Worker mova [tlq-16*1], m2 3279*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 3280*c0909341SAndroid Build Coastguard Worker mova [tlq+16*0], m3 3281*c0909341SAndroid Build Coastguard Worker pshufb m5, m7 3282*c0909341SAndroid Build Coastguard Worker mova [tlq+16*1], m4 3283*c0909341SAndroid Build Coastguard Worker movq [tlq+r4+8], m5 3284*c0909341SAndroid Build Coastguard Worker neg r4d 3285*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3286*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 3287*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3288*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 3289*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3290*c0909341SAndroid Build Coastguard Worker sub tlq, 16*2 3291*c0909341SAndroid Build Coastguard Worker cmp wd, 64 3292*c0909341SAndroid Build Coastguard Worker jl .h64_filter96 ; skip one call if the last 32 bytes aren't used 3293*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 3294*c0909341SAndroid Build Coastguard Worker.h64_filter96: 3295*c0909341SAndroid Build Coastguard Worker add tlq, 127 3296*c0909341SAndroid Build Coastguard Worker.h64_main: 3297*c0909341SAndroid Build Coastguard Worker movd m5, dyd 3298*c0909341SAndroid Build Coastguard Worker sub tlq, r4 3299*c0909341SAndroid Build Coastguard Worker movd m4, 
r4d 3300*c0909341SAndroid Build Coastguard Worker shl r4d, 6 3301*c0909341SAndroid Build Coastguard Worker movd m7, [tlq] 3302*c0909341SAndroid Build Coastguard Worker pxor m6, m6 3303*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+pw_256] 3304*c0909341SAndroid Build Coastguard Worker neg dyq 3305*c0909341SAndroid Build Coastguard Worker pshufb m7, m6 3306*c0909341SAndroid Build Coastguard Worker mova m3, [base+z3_shuf] 3307*c0909341SAndroid Build Coastguard Worker lea r5, [dyq+r4+63] 3308*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 3309*c0909341SAndroid Build Coastguard Worker psubb m4, [base+pb_15to0] 3310*c0909341SAndroid Build Coastguard Worker mova m6, m5 3311*c0909341SAndroid Build Coastguard Worker.h64_loop: 3312*c0909341SAndroid Build Coastguard Worker mov r4, r5 3313*c0909341SAndroid Build Coastguard Worker pand m2, m8, m5 3314*c0909341SAndroid Build Coastguard Worker sar r4, 6 3315*c0909341SAndroid Build Coastguard Worker psubw m1, m9, m2 3316*c0909341SAndroid Build Coastguard Worker psllw m2, 8 3317*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*8] 3318*c0909341SAndroid Build Coastguard Worker por m2, m1 3319*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4-8*7] 3320*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 3321*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3322*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3323*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3324*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3325*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3326*c0909341SAndroid Build Coastguard Worker sub rsp, 64 3327*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3328*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 3329*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*6] 3330*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4-8*5] 3331*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 
3332*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3333*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3334*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3335*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3336*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3337*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3338*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 3339*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*4] 3340*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4-8*3] 3341*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 3342*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3343*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3344*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3345*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3346*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3347*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3348*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m0 3349*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+r4-8*2] 3350*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+r4-8*1] 3351*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 3352*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 3353*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3354*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3355*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m10 3356*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m10 3357*c0909341SAndroid Build Coastguard Worker psrlw m2, m5, 6 3358*c0909341SAndroid Build Coastguard Worker paddw m5, m6 3359*c0909341SAndroid Build Coastguard Worker packsswb m2, m2 3360*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 3361*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3362*c0909341SAndroid Build Coastguard Worker paddsb m2, [base+pb_16] 3363*c0909341SAndroid Build 
Coastguard Worker pand m0, m1 3364*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3365*c0909341SAndroid Build Coastguard Worker por m0, m1 3366*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3367*c0909341SAndroid Build Coastguard Worker paddsb m2, [base+pb_16] 3368*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m0 3369*c0909341SAndroid Build Coastguard Worker pand m0, m1, [rsp+16*2] 3370*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3371*c0909341SAndroid Build Coastguard Worker por m0, m1 3372*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3373*c0909341SAndroid Build Coastguard Worker paddsb m2, [base+pb_16] 3374*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m0 3375*c0909341SAndroid Build Coastguard Worker pand m0, m1, [rsp+16*1] 3376*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3377*c0909341SAndroid Build Coastguard Worker por m0, m1 3378*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m2 3379*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m0 3380*c0909341SAndroid Build Coastguard Worker pand m0, m1, [rsp+16*0] 3381*c0909341SAndroid Build Coastguard Worker pandn m1, m7 3382*c0909341SAndroid Build Coastguard Worker por m0, m1 3383*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m0 3384*c0909341SAndroid Build Coastguard Worker dec wd 3385*c0909341SAndroid Build Coastguard Worker jz .h64_transpose 3386*c0909341SAndroid Build Coastguard Worker add r5, dyq 3387*c0909341SAndroid Build Coastguard Worker jg .h64_loop 3388*c0909341SAndroid Build Coastguard Worker.h64_end_loop: 3389*c0909341SAndroid Build Coastguard Worker sub rsp, 64 3390*c0909341SAndroid Build Coastguard Worker mova [rsp+16*3], m7 3391*c0909341SAndroid Build Coastguard Worker mova [rsp+16*2], m7 3392*c0909341SAndroid Build Coastguard Worker mova [rsp+16*1], m7 3393*c0909341SAndroid Build Coastguard Worker mova [rsp+16*0], m7 3394*c0909341SAndroid Build Coastguard Worker dec wd 3395*c0909341SAndroid Build 
Coastguard Worker jg .h64_end_loop 3396*c0909341SAndroid Build Coastguard Worker.h64_transpose: 3397*c0909341SAndroid Build Coastguard Worker or r3d, 64 3398*c0909341SAndroid Build Coastguard Worker.end_transpose_main: 3399*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3400*c0909341SAndroid Build Coastguard Worker lea r5, [r3*3] 3401*c0909341SAndroid Build Coastguard Worker lea r7, [strideq*3] 3402*c0909341SAndroid Build Coastguard Worker%else 3403*c0909341SAndroid Build Coastguard Worker mov strideq, [dstq] 3404*c0909341SAndroid Build Coastguard Worker mov org_wd, [dstq+strideq] 3405*c0909341SAndroid Build Coastguard Worker%endif 3406*c0909341SAndroid Build Coastguard Worker.end_transpose_loop: 3407*c0909341SAndroid Build Coastguard Worker lea r4, [rsp+r3-8] 3408*c0909341SAndroid Build Coastguard Worker lea r6, [dstq+org_wq-8] 3409*c0909341SAndroid Build Coastguard Worker.end_transpose_loop_y: 3410*c0909341SAndroid Build Coastguard Worker movq m0, [r4+r3*1] 3411*c0909341SAndroid Build Coastguard Worker movq m4, [r4+r3*0] 3412*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3413*c0909341SAndroid Build Coastguard Worker movq m1, [r4+r5 ] 3414*c0909341SAndroid Build Coastguard Worker movq m5, [r4+r3*2] 3415*c0909341SAndroid Build Coastguard Worker lea r2, [r4+r3*4] 3416*c0909341SAndroid Build Coastguard Worker%else 3417*c0909341SAndroid Build Coastguard Worker lea r2, [r4+r3*2] 3418*c0909341SAndroid Build Coastguard Worker movq m1, [r2+r3*1] 3419*c0909341SAndroid Build Coastguard Worker movq m5, [r2+r3*0] 3420*c0909341SAndroid Build Coastguard Worker lea r2, [r2+r3*2] 3421*c0909341SAndroid Build Coastguard Worker%endif 3422*c0909341SAndroid Build Coastguard Worker movq m2, [r2+r3*1] 3423*c0909341SAndroid Build Coastguard Worker movq m6, [r2+r3*0] 3424*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3425*c0909341SAndroid Build Coastguard Worker movq m3, [r2+r5 ] 3426*c0909341SAndroid Build Coastguard Worker movq m7, [r2+r3*2] 
3427*c0909341SAndroid Build Coastguard Worker%else 3428*c0909341SAndroid Build Coastguard Worker lea r2, [r2+r3*2] 3429*c0909341SAndroid Build Coastguard Worker movq m3, [r2+r3*1] 3430*c0909341SAndroid Build Coastguard Worker movq m7, [r2+r3*0] 3431*c0909341SAndroid Build Coastguard Worker%endif 3432*c0909341SAndroid Build Coastguard Worker sub r4, 8 3433*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m4 3434*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m5 3435*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m6 3436*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m7 3437*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m1, m0 3438*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0 3439*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m3, m2 3440*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m2 3441*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m3, m1 3442*c0909341SAndroid Build Coastguard Worker punpckldq m3, m1 3443*c0909341SAndroid Build Coastguard Worker punpckldq m1, m0, m4 3444*c0909341SAndroid Build Coastguard Worker punpckhdq m0, m4 3445*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*0], m0 3446*c0909341SAndroid Build Coastguard Worker movq [r6+strideq*1], m0 3447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3448*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*2], m1 3449*c0909341SAndroid Build Coastguard Worker movq [r6+r7 ], m1 3450*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*4] 3451*c0909341SAndroid Build Coastguard Worker%else 3452*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 3453*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*0], m1 3454*c0909341SAndroid Build Coastguard Worker movq [r6+strideq*1], m1 3455*c0909341SAndroid Build Coastguard Worker lea r6, [r6+strideq*2] 3456*c0909341SAndroid Build Coastguard Worker%endif 3457*c0909341SAndroid Build Coastguard Worker movhps [r6+strideq*0], m2 
; (continuation of .end_transpose_loop_y: remaining row stores of the current
;  8-row group, then loop control for the transposed write-out)
    movq   [r6+strideq*1], m2
%if ARCH_X86_64
    movhps [r6+strideq*2], m3
    movq   [r6+r7       ], m3
    lea                r6, [r6+strideq*4]
%else
    lea                r6, [r6+strideq*2]
    movhps [r6+strideq*0], m3
    movq   [r6+strideq*1], m3
    lea                r6, [r6+strideq*2]
%endif
    cmp                r4, rsp
    jae .end_transpose_loop_y
    lea               rsp, [rsp+r3*8]       ; release the rows just written out
    sub            org_wd, 8
    jg .end_transpose_loop
    RET

;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                         const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
; Palette prediction: expands packed palette indices into pixels.
;
; Each byte of idx carries two 4-bit indices; the low nibble produces the
; earlier pixel and the high nibble the following one (see the interleaves
; below). The 8 palette bytes at pal are kept in the low half of m4 and used
; as a pshufb lookup table.
;
; NOTE(review): nothing below writes a return value, so the "int" in the
; prototype above looks stale - confirm against the C declaration.
; NOTE(review): the pshufb lookups rely on bit 3 of every nibble being clear
; (indices 0..7); a set bit would land in pshufb's zeroing bit (bit 7) of the
; same or the neighbouring byte. Presumably guaranteed by the caller - confirm.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    movq                 m4, [palq]              ; 8 palette bytes, upper half zeroed
    LEA                  r2, pal_pred_ssse3_table
    tzcnt                wd, wm                  ; table index = ctz(w)
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]           ; 32-bit offset relative to the table
    add                  wq, r2
    lea                  r2, [strideq*3]         ; table base no longer needed: r2 = stride*3
    jmp                  wq
.w4:
    ; 8 index bytes -> 16 pixels -> 4 rows of 4
    movq                 m1, [idxq]
    add                idxq, 8
    psrlw                m0, m1, 4               ; high nibbles moved into the low 4 bits
    punpcklbw            m1, m0                  ; lo0 hi0 lo1 hi1 ... in pixel order
    pshufb               m0, m4, m1              ; palette lookup
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032           ; bring dword 1 down to dword 0
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    ; 16 index bytes -> 32 pixels -> 4 rows of 8
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0              ; pixels from the low nibbles
    psrlw                m0, 4
    pshufb               m2, m4, m0              ; pixels from the high nibbles
    punpcklbw            m0, m1, m2              ; interleave back into pixel order
    punpckhbw            m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    ; 16 index bytes -> 32 pixels -> 2 rows of 16 (aligned stores)
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    ; 16 index bytes -> 32 pixels -> 1 row of 32
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64:
    ; 32 index bytes -> 64 pixels -> 1 row of 64, handled as two 32-pixel halves
    movu                 m0, [idxq+16*0]
    movu                 m2, [idxq+16*1]
    add                idxq, 32
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m3, m4, m0
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m3, m4, m2
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    sub                  hd, 1
    jg .w64
    RET

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; IPRED_CFL n: in-place on register m<n> (16-bit lanes).
;   m<n> = m0 + sign(m<n>, m1) * pmulhrsw(|m<n>|, m2)
; m3 is clobbered (holds m<n> with the sign of m1 applied).
; NOTE(review): m0/m1/m2 are set up by the caller outside this macro -
; presumably dc, alpha and a scaled |alpha|; confirm at the use sites.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw               m3, m%1, m1
    pabsw               m%1, m%1
    pmulhrsw            m%1, m2
    psignw              m%1, m3
    paddw               m%1, m0
%endmacro

; Scratch register t0: r7 on UNIX64, r5 otherwise.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn            wd, wm
    movifnidn            hd, hm
    tzcnt               r6d, hd                  ; ctz(h): index of the height entry
    lea                 t0d, [wq+hq]
    movd                 m4, t0d                 ; m4 = w+h (halved below)
    tzcnt               t0d, t0d
    movd                 m5, t0d                 ; m5 = ctz(w+h)
    LEA                  t0, ipred_cfl_ssse3_table
    tzcnt                wd, wd                  ; ctz(w): index of the width entry
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+16]        ; width entries sit 16 bytes into the table
    pcmpeqd              m3, m3                  ; all-ones
    psrlw                m4, 1                   ; m4 = (w+h) >> 1
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6                      ; dispatch on height first
.h4:
3610*c0909341SAndroid Build Coastguard Worker movd m0, [tlq-4] 3611*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 3612*c0909341SAndroid Build Coastguard Worker jmp wq 3613*c0909341SAndroid Build Coastguard Worker.w4: 3614*c0909341SAndroid Build Coastguard Worker movd m1, [tlq+1] 3615*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 3616*c0909341SAndroid Build Coastguard Worker psubw m0, m4 3617*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3618*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 3619*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3620*c0909341SAndroid Build Coastguard Worker jg .w4_mul 3621*c0909341SAndroid Build Coastguard Worker psrlw m0, 3 ; dc >>= ctz(width + height); 3622*c0909341SAndroid Build Coastguard Worker jmp .w4_end 3623*c0909341SAndroid Build Coastguard Worker.w4_mul: 3624*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m0, m0 3625*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3626*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 3627*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3628*c0909341SAndroid Build Coastguard Worker psrlw m0, 2 3629*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 3630*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 3631*c0909341SAndroid Build Coastguard Worker test hd, 8 3632*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 3633*c0909341SAndroid Build Coastguard Worker movd m5, r6d 3634*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m5 3635*c0909341SAndroid Build Coastguard Worker.w4_end: 3636*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 3637*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 3638*c0909341SAndroid Build Coastguard Worker.s4: 3639*c0909341SAndroid Build Coastguard Worker movd m1, alpham 3640*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 3641*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 3642*c0909341SAndroid Build 
Coastguard Worker lea r6, [strideq*3] 3643*c0909341SAndroid Build Coastguard Worker pabsw m2, m1 3644*c0909341SAndroid Build Coastguard Worker psllw m2, 9 3645*c0909341SAndroid Build Coastguard Worker.s4_loop: 3646*c0909341SAndroid Build Coastguard Worker mova m4, [acq] 3647*c0909341SAndroid Build Coastguard Worker mova m5, [acq+16] 3648*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3649*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3650*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3651*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m4 3652*c0909341SAndroid Build Coastguard Worker pshuflw m4, m4, q1032 3653*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m4 3654*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m4 3655*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], m4 3656*c0909341SAndroid Build Coastguard Worker psrlq m4, 32 3657*c0909341SAndroid Build Coastguard Worker movd [dstq+r6 ], m4 3658*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 3659*c0909341SAndroid Build Coastguard Worker add acq, 32 3660*c0909341SAndroid Build Coastguard Worker sub hd, 4 3661*c0909341SAndroid Build Coastguard Worker jg .s4_loop 3662*c0909341SAndroid Build Coastguard Worker RET 3663*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3664*c0909341SAndroid Build Coastguard Worker.h8: 3665*c0909341SAndroid Build Coastguard Worker movq m0, [tlq-8] 3666*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 3667*c0909341SAndroid Build Coastguard Worker jmp wq 3668*c0909341SAndroid Build Coastguard Worker.w8: 3669*c0909341SAndroid Build Coastguard Worker movq m1, [tlq+1] 3670*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 3671*c0909341SAndroid Build Coastguard Worker psubw m4, m0 3672*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 3673*c0909341SAndroid Build Coastguard Worker psubw m0, m4 3674*c0909341SAndroid Build Coastguard Worker paddw m0, m1 
3675*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 3676*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3677*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 3678*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 3679*c0909341SAndroid Build Coastguard Worker cmp hd, 8 3680*c0909341SAndroid Build Coastguard Worker je .w8_end 3681*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 3682*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 3683*c0909341SAndroid Build Coastguard Worker cmp hd, 32 3684*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 3685*c0909341SAndroid Build Coastguard Worker movd m1, r6d 3686*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 3687*c0909341SAndroid Build Coastguard Worker.w8_end: 3688*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 3689*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 3690*c0909341SAndroid Build Coastguard Worker.s8: 3691*c0909341SAndroid Build Coastguard Worker movd m1, alpham 3692*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 3693*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 3694*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 3695*c0909341SAndroid Build Coastguard Worker pabsw m2, m1 3696*c0909341SAndroid Build Coastguard Worker psllw m2, 9 3697*c0909341SAndroid Build Coastguard Worker.s8_loop: 3698*c0909341SAndroid Build Coastguard Worker mova m4, [acq] 3699*c0909341SAndroid Build Coastguard Worker mova m5, [acq+16] 3700*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3701*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3702*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3703*c0909341SAndroid Build Coastguard Worker movq [dstq ], m4 3704*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq ], m4 3705*c0909341SAndroid Build Coastguard Worker mova m4, [acq+32] 3706*c0909341SAndroid Build Coastguard Worker mova m5, [acq+48] 3707*c0909341SAndroid 
Build Coastguard Worker IPRED_CFL 4 3708*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3709*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3710*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], m4 3711*c0909341SAndroid Build Coastguard Worker movhps [dstq+r6 ], m4 3712*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 3713*c0909341SAndroid Build Coastguard Worker add acq, 64 3714*c0909341SAndroid Build Coastguard Worker sub hd, 4 3715*c0909341SAndroid Build Coastguard Worker jg .s8_loop 3716*c0909341SAndroid Build Coastguard Worker RET 3717*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3718*c0909341SAndroid Build Coastguard Worker.h16: 3719*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-16] 3720*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 3721*c0909341SAndroid Build Coastguard Worker jmp wq 3722*c0909341SAndroid Build Coastguard Worker.w16: 3723*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+1] 3724*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 3725*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3726*c0909341SAndroid Build Coastguard Worker psubw m4, m0 3727*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 3728*c0909341SAndroid Build Coastguard Worker psubw m0, m4 3729*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 3730*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3731*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 3732*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 3733*c0909341SAndroid Build Coastguard Worker cmp hd, 16 3734*c0909341SAndroid Build Coastguard Worker je .w16_end 3735*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 3736*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 3737*c0909341SAndroid Build Coastguard Worker test hd, 8|32 3738*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 3739*c0909341SAndroid Build Coastguard Worker movd m1, r6d 
3740*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 3741*c0909341SAndroid Build Coastguard Worker.w16_end: 3742*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 3743*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 3744*c0909341SAndroid Build Coastguard Worker.s16: 3745*c0909341SAndroid Build Coastguard Worker movd m1, alpham 3746*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 3747*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 3748*c0909341SAndroid Build Coastguard Worker pabsw m2, m1 3749*c0909341SAndroid Build Coastguard Worker psllw m2, 9 3750*c0909341SAndroid Build Coastguard Worker.s16_loop: 3751*c0909341SAndroid Build Coastguard Worker mova m4, [acq] 3752*c0909341SAndroid Build Coastguard Worker mova m5, [acq+16] 3753*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3754*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3755*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3756*c0909341SAndroid Build Coastguard Worker mova [dstq], m4 3757*c0909341SAndroid Build Coastguard Worker mova m4, [acq+32] 3758*c0909341SAndroid Build Coastguard Worker mova m5, [acq+48] 3759*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3760*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3761*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3762*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq], m4 3763*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 3764*c0909341SAndroid Build Coastguard Worker add acq, 64 3765*c0909341SAndroid Build Coastguard Worker sub hd, 2 3766*c0909341SAndroid Build Coastguard Worker jg .s16_loop 3767*c0909341SAndroid Build Coastguard Worker RET 3768*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3769*c0909341SAndroid Build Coastguard Worker.h32: 3770*c0909341SAndroid Build Coastguard Worker mova m0, [tlq-32] 3771*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 3772*c0909341SAndroid Build Coastguard Worker mova 
m2, [tlq-16] 3773*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 3774*c0909341SAndroid Build Coastguard Worker paddw m0, m2 3775*c0909341SAndroid Build Coastguard Worker jmp wq 3776*c0909341SAndroid Build Coastguard Worker.w32: 3777*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+1] 3778*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 3779*c0909341SAndroid Build Coastguard Worker movu m2, [tlq+17] 3780*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3 3781*c0909341SAndroid Build Coastguard Worker paddw m1, m2 3782*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3783*c0909341SAndroid Build Coastguard Worker psubw m4, m0 3784*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 3785*c0909341SAndroid Build Coastguard Worker psubw m0, m4 3786*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 3787*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3788*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m3 3789*c0909341SAndroid Build Coastguard Worker psrlw m0, m5 3790*c0909341SAndroid Build Coastguard Worker cmp hd, 32 3791*c0909341SAndroid Build Coastguard Worker je .w32_end 3792*c0909341SAndroid Build Coastguard Worker lea r2d, [hq*2] 3793*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5556 3794*c0909341SAndroid Build Coastguard Worker mov r2d, 0x3334 3795*c0909341SAndroid Build Coastguard Worker test hd, 64|16 3796*c0909341SAndroid Build Coastguard Worker cmovz r6d, r2d 3797*c0909341SAndroid Build Coastguard Worker movd m1, r6d 3798*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m1 3799*c0909341SAndroid Build Coastguard Worker.w32_end: 3800*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 3801*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 3802*c0909341SAndroid Build Coastguard Worker.s32: 3803*c0909341SAndroid Build Coastguard Worker movd m1, alpham 3804*c0909341SAndroid Build Coastguard Worker pshuflw m1, m1, q0000 3805*c0909341SAndroid Build 
Coastguard Worker punpcklqdq m1, m1 3806*c0909341SAndroid Build Coastguard Worker pabsw m2, m1 3807*c0909341SAndroid Build Coastguard Worker psllw m2, 9 3808*c0909341SAndroid Build Coastguard Worker.s32_loop: 3809*c0909341SAndroid Build Coastguard Worker mova m4, [acq] 3810*c0909341SAndroid Build Coastguard Worker mova m5, [acq+16] 3811*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3812*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3813*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3814*c0909341SAndroid Build Coastguard Worker mova [dstq], m4 3815*c0909341SAndroid Build Coastguard Worker mova m4, [acq+32] 3816*c0909341SAndroid Build Coastguard Worker mova m5, [acq+48] 3817*c0909341SAndroid Build Coastguard Worker IPRED_CFL 4 3818*c0909341SAndroid Build Coastguard Worker IPRED_CFL 5 3819*c0909341SAndroid Build Coastguard Worker packuswb m4, m5 3820*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m4 3821*c0909341SAndroid Build Coastguard Worker add dstq, strideq 3822*c0909341SAndroid Build Coastguard Worker add acq, 64 3823*c0909341SAndroid Build Coastguard Worker dec hd 3824*c0909341SAndroid Build Coastguard Worker jg .s32_loop 3825*c0909341SAndroid Build Coastguard Worker RET 3826*c0909341SAndroid Build Coastguard Worker 3827*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3828*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 3829*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int16_t *ac, const int alpha); 3830*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3831*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha 3832*c0909341SAndroid Build Coastguard Worker mov hd, hm ; zero upper half 
3833*c0909341SAndroid Build Coastguard Worker tzcnt r6d, hd 3834*c0909341SAndroid Build Coastguard Worker sub tlq, hq 3835*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3836*c0909341SAndroid Build Coastguard Worker movu m0, [tlq] 3837*c0909341SAndroid Build Coastguard Worker mov t0d, 0x8000 3838*c0909341SAndroid Build Coastguard Worker movd m3, t0d 3839*c0909341SAndroid Build Coastguard Worker movd m2, r6d 3840*c0909341SAndroid Build Coastguard Worker psrld m3, m2 3841*c0909341SAndroid Build Coastguard Worker LEA t0, ipred_cfl_left_ssse3_table 3842*c0909341SAndroid Build Coastguard Worker movsxd r6, [t0+r6*4] 3843*c0909341SAndroid Build Coastguard Worker pcmpeqd m2, m2 3844*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3845*c0909341SAndroid Build Coastguard Worker add r6, t0 3846*c0909341SAndroid Build Coastguard Worker add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table 3847*c0909341SAndroid Build Coastguard Worker movsxd wq, [t0+wq*4] 3848*c0909341SAndroid Build Coastguard Worker add wq, t0 3849*c0909341SAndroid Build Coastguard Worker movifnidn acq, acmp 3850*c0909341SAndroid Build Coastguard Worker jmp r6 3851*c0909341SAndroid Build Coastguard Worker.h32: 3852*c0909341SAndroid Build Coastguard Worker movu m1, [tlq+16] ; unaligned when jumping here from dc_top 3853*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3854*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3855*c0909341SAndroid Build Coastguard Worker.h16: 3856*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q3232 ; psrlq m1, m0, 16 3857*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3858*c0909341SAndroid Build Coastguard Worker.h8: 3859*c0909341SAndroid Build Coastguard Worker pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 3860*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3861*c0909341SAndroid Build Coastguard Worker.h4: 3862*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m2 3863*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, 
m3 3864*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 3865*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 3866*c0909341SAndroid Build Coastguard Worker jmp wq 3867*c0909341SAndroid Build Coastguard Worker 3868*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3869*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 3870*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int16_t *ac, const int alpha); 3871*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3872*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha 3873*c0909341SAndroid Build Coastguard Worker LEA t0, ipred_cfl_left_ssse3_table 3874*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3875*c0909341SAndroid Build Coastguard Worker inc tlq 3876*c0909341SAndroid Build Coastguard Worker movu m0, [tlq] 3877*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3878*c0909341SAndroid Build Coastguard Worker mov r6d, 0x8000 3879*c0909341SAndroid Build Coastguard Worker movd m3, r6d 3880*c0909341SAndroid Build Coastguard Worker movd m2, wd 3881*c0909341SAndroid Build Coastguard Worker psrld m3, m2 3882*c0909341SAndroid Build Coastguard Worker movsxd r6, [t0+wq*4] 3883*c0909341SAndroid Build Coastguard Worker pcmpeqd m2, m2 3884*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3885*c0909341SAndroid Build Coastguard Worker add r6, t0 3886*c0909341SAndroid Build Coastguard Worker add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table 3887*c0909341SAndroid Build Coastguard Worker movsxd wq, [t0+wq*4] 3888*c0909341SAndroid Build Coastguard Worker add wq, t0 3889*c0909341SAndroid Build Coastguard Worker movifnidn acq, acmp 
3890*c0909341SAndroid Build Coastguard Worker jmp r6 3891*c0909341SAndroid Build Coastguard Worker 3892*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3893*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 3894*c0909341SAndroid Build Coastguard Worker; const int width, const int height, const int16_t *ac, const int alpha); 3895*c0909341SAndroid Build Coastguard Worker;--------------------------------------------------------------------------------------- 3896*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha 3897*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 3898*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3899*c0909341SAndroid Build Coastguard Worker LEA r6, ipred_cfl_splat_ssse3_table 3900*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 3901*c0909341SAndroid Build Coastguard Worker movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] 3902*c0909341SAndroid Build Coastguard Worker add wq, r6 3903*c0909341SAndroid Build Coastguard Worker movifnidn acq, acmp 3904*c0909341SAndroid Build Coastguard Worker jmp wq 3905*c0909341SAndroid Build Coastguard Worker 3906*c0909341SAndroid Build Coastguard Worker%macro RELOAD_ACQ_32 1 3907*c0909341SAndroid Build Coastguard Worker mov acq, ac_bakq ; restore acq 3908*c0909341SAndroid Build Coastguard Worker%endmacro 3909*c0909341SAndroid Build Coastguard Worker 3910*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3911*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak 3912*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7 3913*c0909341SAndroid Build Coastguard Worker movddup m2, [pb_2] 3914*c0909341SAndroid Build Coastguard Worker%else 3915*c0909341SAndroid Build Coastguard Workercglobal 
ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h 3916*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4 3917*c0909341SAndroid Build Coastguard Worker%define ac_bakq acmp 3918*c0909341SAndroid Build Coastguard Worker mov t0d, 0x02020202 3919*c0909341SAndroid Build Coastguard Worker movd m2, t0d 3920*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 3921*c0909341SAndroid Build Coastguard Worker%endif 3922*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 3923*c0909341SAndroid Build Coastguard Worker mov t0d, hm 3924*c0909341SAndroid Build Coastguard Worker mov hd, t0d 3925*c0909341SAndroid Build Coastguard Worker imul t0d, wd 3926*c0909341SAndroid Build Coastguard Worker movd m5, t0d 3927*c0909341SAndroid Build Coastguard Worker movifnidn hpadd, hpadm 3928*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3929*c0909341SAndroid Build Coastguard Worker mov ac_bakq, acq 3930*c0909341SAndroid Build Coastguard Worker%endif 3931*c0909341SAndroid Build Coastguard Worker shl hpadd, 2 3932*c0909341SAndroid Build Coastguard Worker sub hd, hpadd 3933*c0909341SAndroid Build Coastguard Worker pxor m4, m4 3934*c0909341SAndroid Build Coastguard Worker cmp wd, 8 3935*c0909341SAndroid Build Coastguard Worker jg .w16 3936*c0909341SAndroid Build Coastguard Worker je .w8 3937*c0909341SAndroid Build Coastguard Worker ; fall-through 3938*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3939*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 3940*c0909341SAndroid Build Coastguard Worker%else 3941*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 3942*c0909341SAndroid Build Coastguard Worker%endif 3943*c0909341SAndroid Build Coastguard Worker.w4: 3944*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 3945*c0909341SAndroid Build Coastguard Worker.w4_loop: 3946*c0909341SAndroid Build Coastguard Worker movq m0, [yq] 
3947*c0909341SAndroid Build Coastguard Worker movq m1, [yq+strideq] 3948*c0909341SAndroid Build Coastguard Worker movhps m0, [yq+strideq*2] 3949*c0909341SAndroid Build Coastguard Worker movhps m1, [yq+stride3q] 3950*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3951*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3952*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3953*c0909341SAndroid Build Coastguard Worker mova [acq], m0 3954*c0909341SAndroid Build Coastguard Worker paddw m4, m0 3955*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 3956*c0909341SAndroid Build Coastguard Worker add acq, 16 3957*c0909341SAndroid Build Coastguard Worker sub hd, 2 3958*c0909341SAndroid Build Coastguard Worker jg .w4_loop 3959*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3960*c0909341SAndroid Build Coastguard Worker jz .calc_avg_4_8 3961*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 3962*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop: 3963*c0909341SAndroid Build Coastguard Worker mova [acq], m0 3964*c0909341SAndroid Build Coastguard Worker paddw m4, m0 3965*c0909341SAndroid Build Coastguard Worker add acq, 16 3966*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 3967*c0909341SAndroid Build Coastguard Worker jg .w4_hpad_loop 3968*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_4_8 3969*c0909341SAndroid Build Coastguard Worker.w8: 3970*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 3971*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 3972*c0909341SAndroid Build Coastguard Worker jnz .w8_wpad 3973*c0909341SAndroid Build Coastguard Worker.w8_loop: 3974*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 3975*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 3976*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3977*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3978*c0909341SAndroid Build Coastguard Worker paddw m0, m1 
3979*c0909341SAndroid Build Coastguard Worker mova [acq], m0 3980*c0909341SAndroid Build Coastguard Worker paddw m4, m0 3981*c0909341SAndroid Build Coastguard Worker mova m0, [yq+strideq*2] 3982*c0909341SAndroid Build Coastguard Worker mova m1, [yq+stride3q] 3983*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3984*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 3985*c0909341SAndroid Build Coastguard Worker paddw m0, m1 3986*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 3987*c0909341SAndroid Build Coastguard Worker paddw m4, m0 3988*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 3989*c0909341SAndroid Build Coastguard Worker add acq, 32 3990*c0909341SAndroid Build Coastguard Worker sub hd, 2 3991*c0909341SAndroid Build Coastguard Worker jg .w8_loop 3992*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 3993*c0909341SAndroid Build Coastguard Worker jz .calc_avg_4_8 3994*c0909341SAndroid Build Coastguard Worker jmp .w8_hpad 3995*c0909341SAndroid Build Coastguard Worker.w8_wpad: ; wpadd=1 3996*c0909341SAndroid Build Coastguard Worker movddup m0, [yq] 3997*c0909341SAndroid Build Coastguard Worker movddup m1, [yq+strideq] 3998*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 3999*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4000*c0909341SAndroid Build Coastguard Worker paddw m0, m1 4001*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4002*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4003*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4004*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4005*c0909341SAndroid Build Coastguard Worker add acq, 16 4006*c0909341SAndroid Build Coastguard Worker sub hd, 1 4007*c0909341SAndroid Build Coastguard Worker jg .w8_wpad 4008*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4009*c0909341SAndroid Build Coastguard Worker jz .calc_avg_4_8 4010*c0909341SAndroid Build Coastguard Worker.w8_hpad: 
4011*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4012*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4013*c0909341SAndroid Build Coastguard Worker add acq, 16 4014*c0909341SAndroid Build Coastguard Worker sub hpadd, 1 4015*c0909341SAndroid Build Coastguard Worker jg .w8_hpad 4016*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_4_8 4017*c0909341SAndroid Build Coastguard Worker.w16: 4018*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4019*c0909341SAndroid Build Coastguard Worker jnz .w16_wpad 4020*c0909341SAndroid Build Coastguard Worker.w16_loop: 4021*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4022*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4023*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4024*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4025*c0909341SAndroid Build Coastguard Worker paddw m0, m1 4026*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4027*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4028*c0909341SAndroid Build Coastguard Worker mova m6, [yq+16] 4029*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq+16] 4030*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m2 4031*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4032*c0909341SAndroid Build Coastguard Worker paddw m6, m1 4033*c0909341SAndroid Build Coastguard Worker mova [acq+16], m6 4034*c0909341SAndroid Build Coastguard Worker paddw m4, m6 4035*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4036*c0909341SAndroid Build Coastguard Worker add acq, 32 4037*c0909341SAndroid Build Coastguard Worker dec hd 4038*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4039*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4040*c0909341SAndroid Build Coastguard Worker jz .calc_avg16 4041*c0909341SAndroid Build Coastguard Worker jmp .w16_hpad_loop 4042*c0909341SAndroid Build Coastguard Worker.w16_wpad: 4043*c0909341SAndroid Build Coastguard Worker cmp 
wpadd, 2 4044*c0909341SAndroid Build Coastguard Worker jl .w16_pad1 4045*c0909341SAndroid Build Coastguard Worker je .w16_pad2 4046*c0909341SAndroid Build Coastguard Worker.w16_pad3: 4047*c0909341SAndroid Build Coastguard Worker movddup m0, [yq] 4048*c0909341SAndroid Build Coastguard Worker movddup m1, [yq+strideq] 4049*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4050*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4051*c0909341SAndroid Build Coastguard Worker paddw m0, m1 4052*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4053*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4054*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4055*c0909341SAndroid Build Coastguard Worker mova m6, m0 4056*c0909341SAndroid Build Coastguard Worker punpckhqdq m6, m0, m0 4057*c0909341SAndroid Build Coastguard Worker mova [acq+16], m6 4058*c0909341SAndroid Build Coastguard Worker paddw m4, m6 4059*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4060*c0909341SAndroid Build Coastguard Worker add acq, 32 4061*c0909341SAndroid Build Coastguard Worker dec hd 4062*c0909341SAndroid Build Coastguard Worker jg .w16_pad3 4063*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4064*c0909341SAndroid Build Coastguard Worker.w16_pad2: 4065*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4066*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4067*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4068*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4069*c0909341SAndroid Build Coastguard Worker paddw m0, m1 4070*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4071*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4072*c0909341SAndroid Build Coastguard Worker pshufhw m6, m0, q3333 4073*c0909341SAndroid Build Coastguard Worker punpckhqdq m6, m6 4074*c0909341SAndroid Build Coastguard Worker mova [acq+16], m6 4075*c0909341SAndroid Build Coastguard Worker paddw m4, m6 
4076*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4077*c0909341SAndroid Build Coastguard Worker add acq, 32 4078*c0909341SAndroid Build Coastguard Worker dec hd 4079*c0909341SAndroid Build Coastguard Worker jg .w16_pad2 4080*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4081*c0909341SAndroid Build Coastguard Worker.w16_pad1: 4082*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4083*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4084*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4085*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4086*c0909341SAndroid Build Coastguard Worker paddw m0, m1 4087*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4088*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4089*c0909341SAndroid Build Coastguard Worker movddup m6, [yq+16] 4090*c0909341SAndroid Build Coastguard Worker movddup m1, [yq+strideq+16] 4091*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m2 4092*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4093*c0909341SAndroid Build Coastguard Worker paddw m6, m1 4094*c0909341SAndroid Build Coastguard Worker pshufhw m6, m6, q3333 4095*c0909341SAndroid Build Coastguard Worker mova [acq+16], m6 4096*c0909341SAndroid Build Coastguard Worker paddw m4, m6 4097*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4098*c0909341SAndroid Build Coastguard Worker add acq, 32 4099*c0909341SAndroid Build Coastguard Worker dec hd 4100*c0909341SAndroid Build Coastguard Worker jg .w16_pad1 4101*c0909341SAndroid Build Coastguard Worker.w16_wpad_done: 4102*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4103*c0909341SAndroid Build Coastguard Worker jz .calc_avg16 4104*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop: 4105*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4106*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4107*c0909341SAndroid Build Coastguard Worker mova [acq+16], m6 
4108*c0909341SAndroid Build Coastguard Worker paddw m4, m6 4109*c0909341SAndroid Build Coastguard Worker add acq, 32 4110*c0909341SAndroid Build Coastguard Worker dec hpadd 4111*c0909341SAndroid Build Coastguard Worker jg .w16_hpad_loop 4112*c0909341SAndroid Build Coastguard Worker jmp .calc_avg16 4113*c0909341SAndroid Build Coastguard Worker 4114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4115*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 4116*c0909341SAndroid Build Coastguard Worker%else 4117*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 4118*c0909341SAndroid Build Coastguard Worker%endif 4119*c0909341SAndroid Build Coastguard Worker.calc_avg_4_8: 4120*c0909341SAndroid Build Coastguard Worker psrlw m2, 9 4121*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2 4122*c0909341SAndroid Build Coastguard Worker jmp .calc_avg 4123*c0909341SAndroid Build Coastguard Worker.calc_avg16: 4124*c0909341SAndroid Build Coastguard Worker psrld m0, m4, 16 4125*c0909341SAndroid Build Coastguard Worker pslld m4, 16 4126*c0909341SAndroid Build Coastguard Worker psrld m4, 16 4127*c0909341SAndroid Build Coastguard Worker paddd m4, m0 4128*c0909341SAndroid Build Coastguard Worker.calc_avg: 4129*c0909341SAndroid Build Coastguard Worker movd szd, m5 4130*c0909341SAndroid Build Coastguard Worker psrad m5, 1 4131*c0909341SAndroid Build Coastguard Worker tzcnt r1d, szd 4132*c0909341SAndroid Build Coastguard Worker paddd m4, m5 4133*c0909341SAndroid Build Coastguard Worker movd m1, r1d 4134*c0909341SAndroid Build Coastguard Worker pshufd m0, m4, q2301 4135*c0909341SAndroid Build Coastguard Worker paddd m0, m4 4136*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q1032 4137*c0909341SAndroid Build Coastguard Worker paddd m0, m4 4138*c0909341SAndroid Build Coastguard Worker psrad m0, m1 ; sum >>= log2sz; 4139*c0909341SAndroid Build Coastguard Worker packssdw m0, m0 4140*c0909341SAndroid 
; NOTE(review): this region is a line-wrapped dump with blame annotations, so
; physical lines do not coincide with instructions. Each comment block below
; describes the code carried by the physical line that follows it.
;
; The next line opens with the end of the preceding routine (RELOAD_ACQ_32,
; .sub_loop, RET); ipred_cfl_ac_422_8bpc starts right after that RET.
; Entry/setup of ipred_cfl_ac_422_8bpc:
;   - arguments: ac, y, stride, wpad, hpad, w, h (ac_bak is an extra register
;     on x86-64 that receives a copy of the incoming ac pointer).
;   - m2 is the pmaddubsw multiplier: loaded from pb_4 on x86-64, built by
;     broadcasting 0x04040404 on x86-32 (so every byte is 4 there; pb_4 is
;     presumably the same constant - it is defined outside this chunk).
;   - m6 = w * h (movd of the imul result); consumed later by .calc_avg.
;   - hpad is shifted left by 2 and subtracted from h, so the row loops below
;     only count the rows that are actually read from y.
;   - m4 and m5 are cleared; they are the two running 16-bit sums.
Build Coastguard Worker RELOAD_ACQ_32 acq 4141*c0909341SAndroid Build Coastguard Worker.sub_loop: 4142*c0909341SAndroid Build Coastguard Worker mova m1, [acq] 4143*c0909341SAndroid Build Coastguard Worker psubw m1, m0 ; ac[x] -= sum; 4144*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4145*c0909341SAndroid Build Coastguard Worker add acq, 16 4146*c0909341SAndroid Build Coastguard Worker sub szd, 8 4147*c0909341SAndroid Build Coastguard Worker jg .sub_loop 4148*c0909341SAndroid Build Coastguard Worker RET 4149*c0909341SAndroid Build Coastguard Worker 4150*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4151*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak 4152*c0909341SAndroid Build Coastguard Worker movddup m2, [pb_4] 4153*c0909341SAndroid Build Coastguard Worker%else 4154*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h 4155*c0909341SAndroid Build Coastguard Worker mov t0d, 0x04040404 4156*c0909341SAndroid Build Coastguard Worker movd m2, t0d 4157*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 4158*c0909341SAndroid Build Coastguard Worker%endif 4159*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 4160*c0909341SAndroid Build Coastguard Worker mov t0d, hm 4161*c0909341SAndroid Build Coastguard Worker mov hd, t0d 4162*c0909341SAndroid Build Coastguard Worker imul t0d, wd 4163*c0909341SAndroid Build Coastguard Worker movd m6, t0d 4164*c0909341SAndroid Build Coastguard Worker movifnidn hpadd, hpadm 4165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4166*c0909341SAndroid Build Coastguard Worker mov ac_bakq, acq 4167*c0909341SAndroid Build Coastguard Worker%endif 4168*c0909341SAndroid Build Coastguard Worker shl hpadd, 2 4169*c0909341SAndroid Build Coastguard Worker sub hd, hpadd 4170*c0909341SAndroid Build Coastguard Worker pxor m4, m4 4171*c0909341SAndroid Build Coastguard Worker pxor 
; Width dispatch: w > 8 -> .w16, w == 8 -> .w8, otherwise fall through to .w4.
; DEFINE_ARGS reuses the register slot of w under the name stride3.
; .w4_loop consumes 4 rows per iteration; movq/movhps pack two 8-byte rows
; into one register. pmaddubsw against m2 multiplies each byte by the
; multiplier and adds horizontally adjacent pairs, giving 4 words per row.
;   m1 = rows 0-1 -> [acq],    summed into m5
;   m0 = rows 2-3 -> [acq+16], summed into m4
; When hpad != 0, punpckhqdq copies the high half of m0 (the last row read)
; into both halves so .w4_hpad_loop can emit two padded rows per store.
m5, m5 4172*c0909341SAndroid Build Coastguard Worker cmp wd, 8 4173*c0909341SAndroid Build Coastguard Worker jg .w16 4174*c0909341SAndroid Build Coastguard Worker je .w8 4175*c0909341SAndroid Build Coastguard Worker ; fall-through 4176*c0909341SAndroid Build Coastguard Worker 4177*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4178*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 4179*c0909341SAndroid Build Coastguard Worker%else 4180*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 4181*c0909341SAndroid Build Coastguard Worker%endif 4182*c0909341SAndroid Build Coastguard Worker.w4: 4183*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4184*c0909341SAndroid Build Coastguard Worker.w4_loop: 4185*c0909341SAndroid Build Coastguard Worker movq m1, [yq] 4186*c0909341SAndroid Build Coastguard Worker movhps m1, [yq+strideq] 4187*c0909341SAndroid Build Coastguard Worker movq m0, [yq+strideq*2] 4188*c0909341SAndroid Build Coastguard Worker movhps m0, [yq+stride3q] 4189*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4190*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4191*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4192*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4193*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4194*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4195*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 4196*c0909341SAndroid Build Coastguard Worker add acq, 32 4197*c0909341SAndroid Build Coastguard Worker sub hd, 4 4198*c0909341SAndroid Build Coastguard Worker jg .w4_loop 4199*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4200*c0909341SAndroid Build Coastguard Worker jz .calc_avg_4 4201*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4202*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop: 4203*c0909341SAndroid Build Coastguard Worker mova 
; Remainder of .w4_hpad_loop: each pass stores m0 (16 bytes), adds it to m4,
; advances ac by 16 and decrements hpad by 2; then control goes to .calc_avg_4.
; .w8: stride3 = stride*3; a non-zero wpad diverts to .w8_wpad.
; .w8_loop consumes 4 rows per iteration, one 16-byte row per register:
;   rows 0 and 2 go through m1 and are summed into m5,
;   rows 1 and 3 go through m0 and are summed into m4.
; m0 therefore still holds the last row when the loop exits, which is what
; .w8_hpad replicates.
[acq], m0 4204*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4205*c0909341SAndroid Build Coastguard Worker add acq, 16 4206*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4207*c0909341SAndroid Build Coastguard Worker jg .w4_hpad_loop 4208*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_4 4209*c0909341SAndroid Build Coastguard Worker.w8: 4210*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4211*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4212*c0909341SAndroid Build Coastguard Worker jnz .w8_wpad 4213*c0909341SAndroid Build Coastguard Worker.w8_loop: 4214*c0909341SAndroid Build Coastguard Worker mova m1, [yq] 4215*c0909341SAndroid Build Coastguard Worker mova m0, [yq+strideq] 4216*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4217*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4218*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4219*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4220*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4221*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4222*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq*2] 4223*c0909341SAndroid Build Coastguard Worker mova m0, [yq+stride3q] 4224*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4225*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4226*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4227*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4228*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4229*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4230*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 4231*c0909341SAndroid Build Coastguard Worker add acq, 64 4232*c0909341SAndroid Build Coastguard Worker sub hd, 4 4233*c0909341SAndroid Build Coastguard Worker jg .w8_loop 4234*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4235*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 
; .w8_wpad (w == 8 with horizontal padding), 2 rows per iteration: movddup
; reads only 8 bytes of the row and duplicates them, so after pmaddubsw the
; low four words are the valid samples; pshufhw q3333 then rewrites the high
; four words (q3333 is an x86inc shuffle immediate - presumably "element 3 in
; every slot", i.e. the last valid word replicated into the padded columns).
; .w8_hpad: stores m0 (last row produced) to two consecutive rows per pass and
; adds it to m4 twice; hpad drops by 2 each pass.
; .w16: a non-zero wpad diverts to .w16_wpad, otherwise .w16_loop starts here.
4236*c0909341SAndroid Build Coastguard Worker jmp .w8_hpad 4237*c0909341SAndroid Build Coastguard Worker.w8_wpad: 4238*c0909341SAndroid Build Coastguard Worker movddup m1, [yq] 4239*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4240*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4241*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4242*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4243*c0909341SAndroid Build Coastguard Worker movddup m0, [yq+strideq] 4244*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4245*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4246*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4247*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4248*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4249*c0909341SAndroid Build Coastguard Worker add acq, 32 4250*c0909341SAndroid Build Coastguard Worker sub hd, 2 4251*c0909341SAndroid Build Coastguard Worker jg .w8_wpad 4252*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4253*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4254*c0909341SAndroid Build Coastguard Worker.w8_hpad: 4255*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4256*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4257*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4258*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4259*c0909341SAndroid Build Coastguard Worker add acq, 32 4260*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4261*c0909341SAndroid Build Coastguard Worker jg .w8_hpad 4262*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_8_16 4263*c0909341SAndroid Build Coastguard Worker.w16: 4264*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4265*c0909341SAndroid Build Coastguard Worker jnz .w16_wpad 4266*c0909341SAndroid Build Coastguard Worker.w16_loop: 4267*c0909341SAndroid Build Coastguard Worker mova m1, [yq] 4268*c0909341SAndroid Build Coastguard Worker 
; .w16_loop consumes 2 rows per iteration, each row split over two registers
; (m1 = bytes 0-15, m0 = bytes 16-31). Row 0 is summed into m5, row 1 into m4.
; On exit m1/m0 hold the two halves of the last row for .w16_hpad_loop.
; .w16_wpad selects the padding variant from wpad:
;   wpad < 2 -> .w16_pad1, wpad == 2 -> .w16_pad2, otherwise -> .w16_pad3.
; .w16_pad3 reads 8 bytes per row; the high words of the first register and
; the whole second register (punpckhqdq of the first) are replicated padding.
mova m0, [yq+16] 4269*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4270*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4271*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4272*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4273*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4274*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4275*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4276*c0909341SAndroid Build Coastguard Worker mova m0, [yq+strideq+16] 4277*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4278*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4279*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4280*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4281*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4282*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4283*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4284*c0909341SAndroid Build Coastguard Worker add acq, 64 4285*c0909341SAndroid Build Coastguard Worker sub hd, 2 4286*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4287*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4288*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4289*c0909341SAndroid Build Coastguard Worker jmp .w16_hpad_loop 4290*c0909341SAndroid Build Coastguard Worker.w16_wpad: 4291*c0909341SAndroid Build Coastguard Worker cmp wpadd, 2 4292*c0909341SAndroid Build Coastguard Worker jl .w16_pad1 4293*c0909341SAndroid Build Coastguard Worker je .w16_pad2 4294*c0909341SAndroid Build Coastguard Worker.w16_pad3: 4295*c0909341SAndroid Build Coastguard Worker movddup m1, [yq] 4296*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4297*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4298*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4299*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4300*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 
; Second row of .w16_pad3 (same pattern, summed into m4; the padded second
; register is left in m0 so the hpad loop sees m1 = left, m0 = right).
; .w16_pad2: the first 16 bytes of each row are read with mova, so the first
; register is fully valid; the second register is built from it with
; pshufhw q3333 + punpckhqdq and contains only replicated padding.
4301*c0909341SAndroid Build Coastguard Worker mova [acq+16], m1 4302*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4303*c0909341SAndroid Build Coastguard Worker movddup m1, [yq+strideq] 4304*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4305*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4306*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4307*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4308*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m1, m1 4309*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4310*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4311*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4312*c0909341SAndroid Build Coastguard Worker add acq, 64 4313*c0909341SAndroid Build Coastguard Worker sub hd, 2 4314*c0909341SAndroid Build Coastguard Worker jg .w16_pad3 4315*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4316*c0909341SAndroid Build Coastguard Worker.w16_pad2: 4317*c0909341SAndroid Build Coastguard Worker mova m1, [yq] 4318*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4319*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4320*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4321*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4322*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 4323*c0909341SAndroid Build Coastguard Worker mova [acq+16], m1 4324*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4325*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4326*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4327*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4328*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4329*c0909341SAndroid Build Coastguard Worker mova m0, m1 4330*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4331*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4332*c0909341SAndroid Build Coastguard Worker mova 
; End of .w16_pad2, then .w16_pad1: the first register is a full 16-byte load;
; the second is built from 8 bytes at offset +16 (movddup), with its high four
; words rewritten by pshufhw q3333.
; All three variants leave m1/m0 = left/right half of the last row and meet at
; .w16_wpad_done, which tests hpad.
[acq+48], m0 4333*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4334*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4335*c0909341SAndroid Build Coastguard Worker add acq, 64 4336*c0909341SAndroid Build Coastguard Worker sub hd, 2 4337*c0909341SAndroid Build Coastguard Worker jg .w16_pad2 4338*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4339*c0909341SAndroid Build Coastguard Worker.w16_pad1: 4340*c0909341SAndroid Build Coastguard Worker mova m1, [yq] 4341*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4342*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4343*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4344*c0909341SAndroid Build Coastguard Worker movddup m0, [yq+16] 4345*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4346*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4347*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4348*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4349*c0909341SAndroid Build Coastguard Worker mova m1, [yq+strideq] 4350*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4351*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4352*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4353*c0909341SAndroid Build Coastguard Worker movddup m0, [yq+strideq+16] 4354*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4355*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4356*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4357*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4358*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4359*c0909341SAndroid Build Coastguard Worker add acq, 64 4360*c0909341SAndroid Build Coastguard Worker sub hd, 2 4361*c0909341SAndroid Build Coastguard Worker jg .w16_pad1 4362*c0909341SAndroid Build Coastguard Worker.w16_wpad_done: 4363*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4364*c0909341SAndroid Build Coastguard Worker jz 
; .w16_hpad_loop: writes the last row (m1 | m0) to two consecutive output rows
; per pass; m1 is added to m4 and m0 to m5 for each copy; hpad drops by 2.
; DEFINE_ARGS then reuses the stride/stride3 slots as iptr/sz for the tail.
; .calc_avg_4: psrlw m2, 10 turns each multiplier word into a small constant
; (0x0404 >> 10 == 1 when every byte of m2 is 4), so pmaddwd adds adjacent
; word pairs of m5 and m4 into dwords (the m4 result lands in m0).
; .calc_avg_8_16: widens the word sums without pmaddwd - each dword of m5 is
; split into its upper and lower 16 bits with logical shifts and the two parts
; are added; the same is started for m4 via m0 at the end of this line.
.calc_avg_8_16 4365*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop: 4366*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4367*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4368*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4369*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4370*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4371*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4372*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4373*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4374*c0909341SAndroid Build Coastguard Worker add acq, 64 4375*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4376*c0909341SAndroid Build Coastguard Worker jg .w16_hpad_loop 4377*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_8_16 4378*c0909341SAndroid Build Coastguard Worker 4379*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4380*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 4381*c0909341SAndroid Build Coastguard Worker%else 4382*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 4383*c0909341SAndroid Build Coastguard Worker%endif 4384*c0909341SAndroid Build Coastguard Worker.calc_avg_4: 4385*c0909341SAndroid Build Coastguard Worker psrlw m2, 10 4386*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m2 4387*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4, m2 4388*c0909341SAndroid Build Coastguard Worker jmp .calc_avg 4389*c0909341SAndroid Build Coastguard Worker.calc_avg_8_16: 4390*c0909341SAndroid Build Coastguard Worker mova m0, m5 4391*c0909341SAndroid Build Coastguard Worker psrld m5, 16 4392*c0909341SAndroid Build Coastguard Worker pslld m0, 16 4393*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4394*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4395*c0909341SAndroid Build Coastguard Worker mova m0, m4 4396*c0909341SAndroid Build Coastguard Worker psrld m0, 16 
; Rest of the m4 widening, then the shared tail .calc_avg:
;   - m5 += m0 merges the two dword partial sums.
;   - sz = low dword of m6 (w*h); m6 >> 1 is added to m5 as the rounding term
;     before the reduction, and tzcnt(sz) supplies the shift count.
;   - the pshufd q2301 / q1032 + paddd pairs add the four dwords together.
;   - psrad by m1 divides by w*h; packssdw narrows the average back to words.
;   - RELOAD_ACQ_32 rewinds ac (per the existing comment: ac = ac_orig).
; .sub_loop subtracts the average from ac, 8 words per pass, until sz <= 0.
; After RET this physical line continues with the header of the next routine,
; ipred_cfl_ac_444_8bpc.
4397*c0909341SAndroid Build Coastguard Worker pslld m4, 16 4398*c0909341SAndroid Build Coastguard Worker psrld m4, 16 4399*c0909341SAndroid Build Coastguard Worker paddd m0, m4 4400*c0909341SAndroid Build Coastguard Worker.calc_avg: 4401*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4402*c0909341SAndroid Build Coastguard Worker movd szd, m6 4403*c0909341SAndroid Build Coastguard Worker psrad m6, 1 4404*c0909341SAndroid Build Coastguard Worker tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); 4405*c0909341SAndroid Build Coastguard Worker paddd m5, m6 4406*c0909341SAndroid Build Coastguard Worker movd m1, r1d 4407*c0909341SAndroid Build Coastguard Worker pshufd m0, m5, q2301 4408*c0909341SAndroid Build Coastguard Worker paddd m0, m5 4409*c0909341SAndroid Build Coastguard Worker pshufd m5, m0, q1032 4410*c0909341SAndroid Build Coastguard Worker paddd m0, m5 4411*c0909341SAndroid Build Coastguard Worker psrad m0, m1 ; sum >>= log2sz; 4412*c0909341SAndroid Build Coastguard Worker packssdw m0, m0 4413*c0909341SAndroid Build Coastguard Worker RELOAD_ACQ_32 acq ; ac = ac_orig 4414*c0909341SAndroid Build Coastguard Worker.sub_loop: 4415*c0909341SAndroid Build Coastguard Worker mova m1, [acq] 4416*c0909341SAndroid Build Coastguard Worker psubw m1, m0 4417*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4418*c0909341SAndroid Build Coastguard Worker add acq, 16 4419*c0909341SAndroid Build Coastguard Worker sub szd, 8 4420*c0909341SAndroid Build Coastguard Worker jg .sub_loop 4421*c0909341SAndroid Build Coastguard Worker RET 4422*c0909341SAndroid Build Coastguard Worker 4423*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4424*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak 4425*c0909341SAndroid Build Coastguard Worker movddup m2, [pb_4] 4426*c0909341SAndroid Build Coastguard Worker%else 4427*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_8bpc, 4,
7, 7, -5*16, ac, y, stride, wpad, hpad, w, h 4428*c0909341SAndroid Build Coastguard Worker%define ac_bakq [rsp+16*4] 4429*c0909341SAndroid Build Coastguard Worker mov t0d, 0x04040404 4430*c0909341SAndroid Build Coastguard Worker movd m2, t0d 4431*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 4432*c0909341SAndroid Build Coastguard Worker%endif 4433*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 4434*c0909341SAndroid Build Coastguard Worker movifnidn hpadd, hpadm 4435*c0909341SAndroid Build Coastguard Worker movd m0, hpadd 4436*c0909341SAndroid Build Coastguard Worker mov t0d, hm 4437*c0909341SAndroid Build Coastguard Worker mov hd, t0d 4438*c0909341SAndroid Build Coastguard Worker imul t0d, wd 4439*c0909341SAndroid Build Coastguard Worker movd m6, t0d 4440*c0909341SAndroid Build Coastguard Worker movd hpadd, m0 4441*c0909341SAndroid Build Coastguard Worker mov ac_bakq, acq 4442*c0909341SAndroid Build Coastguard Worker shl hpadd, 2 4443*c0909341SAndroid Build Coastguard Worker sub hd, hpadd 4444*c0909341SAndroid Build Coastguard Worker pxor m5, m5 4445*c0909341SAndroid Build Coastguard Worker pxor m4, m4 4446*c0909341SAndroid Build Coastguard Worker cmp wd, 16 4447*c0909341SAndroid Build Coastguard Worker jg .w32 4448*c0909341SAndroid Build Coastguard Worker cmp wd, 8 4449*c0909341SAndroid Build Coastguard Worker jg .w16 4450*c0909341SAndroid Build Coastguard Worker je .w8 4451*c0909341SAndroid Build Coastguard Worker ; fall-through 4452*c0909341SAndroid Build Coastguard Worker 4453*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4454*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 4455*c0909341SAndroid Build Coastguard Worker%else 4456*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 4457*c0909341SAndroid Build Coastguard Worker%endif 4458*c0909341SAndroid Build Coastguard Worker.w4: 4459*c0909341SAndroid Build Coastguard Worker lea 
stride3q, [strideq*3] 4460*c0909341SAndroid Build Coastguard Worker.w4_loop: 4461*c0909341SAndroid Build Coastguard Worker movd m1, [yq] 4462*c0909341SAndroid Build Coastguard Worker movd m3, [yq+strideq] 4463*c0909341SAndroid Build Coastguard Worker punpckldq m1, m3 4464*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4465*c0909341SAndroid Build Coastguard Worker movd m0, [yq+strideq*2] 4466*c0909341SAndroid Build Coastguard Worker movd m3, [yq+stride3q] 4467*c0909341SAndroid Build Coastguard Worker punpckldq m0, m3 4468*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 4469*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4470*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4471*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4472*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4473*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4474*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4475*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 4476*c0909341SAndroid Build Coastguard Worker add acq, 32 4477*c0909341SAndroid Build Coastguard Worker sub hd, 4 4478*c0909341SAndroid Build Coastguard Worker jg .w4_loop 4479*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4480*c0909341SAndroid Build Coastguard Worker jz .calc_avg_4 4481*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4482*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop: 4483*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4484*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4485*c0909341SAndroid Build Coastguard Worker add acq, 16 4486*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4487*c0909341SAndroid Build Coastguard Worker jg .w4_hpad_loop 4488*c0909341SAndroid Build Coastguard Worker.calc_avg_4: 4489*c0909341SAndroid Build Coastguard Worker psrlw m2, 10 4490*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m2 4491*c0909341SAndroid Build Coastguard Worker jmp .calc_avg 
4492*c0909341SAndroid Build Coastguard Worker 4493*c0909341SAndroid Build Coastguard Worker.w8: 4494*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4495*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4496*c0909341SAndroid Build Coastguard Worker jnz .w8_wpad 4497*c0909341SAndroid Build Coastguard Worker.w8_loop: 4498*c0909341SAndroid Build Coastguard Worker movq m1, [yq] 4499*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4500*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4501*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4502*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4503*c0909341SAndroid Build Coastguard Worker movq m0, [yq+strideq] 4504*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 4505*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4506*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4507*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4508*c0909341SAndroid Build Coastguard Worker movq m1, [yq+strideq*2] 4509*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4510*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4511*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4512*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4513*c0909341SAndroid Build Coastguard Worker movq m0, [yq+stride3q] 4514*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 4515*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4516*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4517*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4518*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*4] 4519*c0909341SAndroid Build Coastguard Worker add acq, 64 4520*c0909341SAndroid Build Coastguard Worker sub hd, 4 4521*c0909341SAndroid Build Coastguard Worker jg .w8_loop 4522*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4523*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4524*c0909341SAndroid 
Build Coastguard Worker jmp .w8_hpad 4525*c0909341SAndroid Build Coastguard Worker.w8_wpad: 4526*c0909341SAndroid Build Coastguard Worker movd m1, [yq] 4527*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4528*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 4529*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4530*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4531*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4532*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4533*c0909341SAndroid Build Coastguard Worker movd m0, [yq+strideq] 4534*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 4535*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 4536*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4537*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4538*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4539*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4540*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4541*c0909341SAndroid Build Coastguard Worker add acq, 32 4542*c0909341SAndroid Build Coastguard Worker sub hd, 2 4543*c0909341SAndroid Build Coastguard Worker jg .w8_wpad 4544*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4545*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4546*c0909341SAndroid Build Coastguard Worker.w8_hpad: 4547*c0909341SAndroid Build Coastguard Worker mova [acq], m0 4548*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4549*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4550*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4551*c0909341SAndroid Build Coastguard Worker add acq, 32 4552*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4553*c0909341SAndroid Build Coastguard Worker jg .w8_hpad 4554*c0909341SAndroid Build Coastguard Worker jmp .calc_avg_8_16 4555*c0909341SAndroid Build Coastguard Worker 4556*c0909341SAndroid Build Coastguard Worker.w16: 
4557*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4558*c0909341SAndroid Build Coastguard Worker jnz .w16_wpad 4559*c0909341SAndroid Build Coastguard Worker.w16_loop: 4560*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4561*c0909341SAndroid Build Coastguard Worker mova m1, m0 4562*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4563*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4564*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4565*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4566*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4567*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4568*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4569*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4570*c0909341SAndroid Build Coastguard Worker mova m0, [yq+strideq] 4571*c0909341SAndroid Build Coastguard Worker mova m1, m0 4572*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4573*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4574*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4575*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4576*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4577*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4578*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4579*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4580*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4581*c0909341SAndroid Build Coastguard Worker add acq, 64 4582*c0909341SAndroid Build Coastguard Worker sub hd, 2 4583*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4584*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4585*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4586*c0909341SAndroid Build Coastguard Worker jmp .w16_hpad_loop 4587*c0909341SAndroid Build Coastguard Worker.w16_wpad: 4588*c0909341SAndroid Build Coastguard Worker cmp wpadd, 2 4589*c0909341SAndroid Build 
Coastguard Worker jl .w16_pad1 4590*c0909341SAndroid Build Coastguard Worker je .w16_pad2 4591*c0909341SAndroid Build Coastguard Worker.w16_pad3: 4592*c0909341SAndroid Build Coastguard Worker movd m1, [yq] 4593*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4594*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 4595*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4596*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4597*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4598*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4599*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 4600*c0909341SAndroid Build Coastguard Worker mova [acq+16], m1 4601*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4602*c0909341SAndroid Build Coastguard Worker movd m1, [yq+strideq] 4603*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4604*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 4605*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4606*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4607*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4608*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4609*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m1, m1 4610*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4611*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4612*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4613*c0909341SAndroid Build Coastguard Worker add acq, 64 4614*c0909341SAndroid Build Coastguard Worker sub hd, 2 4615*c0909341SAndroid Build Coastguard Worker jg .w16_pad3 4616*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4617*c0909341SAndroid Build Coastguard Worker.w16_pad2: 4618*c0909341SAndroid Build Coastguard Worker movq m1, [yq] 4619*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4620*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4621*c0909341SAndroid Build Coastguard 
Worker mova [acq], m1 4622*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4623*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4624*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 4625*c0909341SAndroid Build Coastguard Worker mova [acq+16], m1 4626*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4627*c0909341SAndroid Build Coastguard Worker movq m1, [yq+strideq] 4628*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4629*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4630*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4631*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4632*c0909341SAndroid Build Coastguard Worker mova m0, m1 4633*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4634*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4635*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4636*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4637*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4638*c0909341SAndroid Build Coastguard Worker add acq, 64 4639*c0909341SAndroid Build Coastguard Worker sub hd, 2 4640*c0909341SAndroid Build Coastguard Worker jg .w16_pad2 4641*c0909341SAndroid Build Coastguard Worker jmp .w16_wpad_done 4642*c0909341SAndroid Build Coastguard Worker.w16_pad1: 4643*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4644*c0909341SAndroid Build Coastguard Worker mova m1, m0 4645*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4646*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4647*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4648*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4649*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4650*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 4651*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4652*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4653*c0909341SAndroid Build Coastguard Worker 
mova [acq+16], m0 4654*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4655*c0909341SAndroid Build Coastguard Worker mova m0, [yq+strideq] 4656*c0909341SAndroid Build Coastguard Worker mova m1, m0 4657*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4658*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4659*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4660*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4661*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4662*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 4663*c0909341SAndroid Build Coastguard Worker pshufhw m0, m0, q3333 4664*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4665*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4666*c0909341SAndroid Build Coastguard Worker paddw m4, m0 4667*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq*2] 4668*c0909341SAndroid Build Coastguard Worker add acq, 64 4669*c0909341SAndroid Build Coastguard Worker sub hd, 2 4670*c0909341SAndroid Build Coastguard Worker jg .w16_pad1 4671*c0909341SAndroid Build Coastguard Worker.w16_wpad_done: 4672*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4673*c0909341SAndroid Build Coastguard Worker jz .calc_avg_8_16 4674*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop: 4675*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4676*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4677*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4678*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4679*c0909341SAndroid Build Coastguard Worker mova [acq+32], m1 4680*c0909341SAndroid Build Coastguard Worker mova [acq+48], m0 4681*c0909341SAndroid Build Coastguard Worker paddw m4, m1 4682*c0909341SAndroid Build Coastguard Worker paddw m5, m0 4683*c0909341SAndroid Build Coastguard Worker add acq, 64 4684*c0909341SAndroid Build Coastguard Worker sub hpadd, 2 4685*c0909341SAndroid Build Coastguard Worker jg .w16_hpad_loop 
4686*c0909341SAndroid Build Coastguard Worker.calc_avg_8_16: 4687*c0909341SAndroid Build Coastguard Worker mova m0, m5 4688*c0909341SAndroid Build Coastguard Worker psrld m5, 16 4689*c0909341SAndroid Build Coastguard Worker pslld m0, 16 4690*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4691*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4692*c0909341SAndroid Build Coastguard Worker mova m0, m4 4693*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4694*c0909341SAndroid Build Coastguard Worker pslld m4, 16 4695*c0909341SAndroid Build Coastguard Worker psrld m4, 16 4696*c0909341SAndroid Build Coastguard Worker paddd m0, m4 4697*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4698*c0909341SAndroid Build Coastguard Worker jmp .calc_avg 4699*c0909341SAndroid Build Coastguard Worker 4700*c0909341SAndroid Build Coastguard Worker.w32: 4701*c0909341SAndroid Build Coastguard Worker pxor m0, m0 4702*c0909341SAndroid Build Coastguard Worker mova [rsp ], m0 4703*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m0 4704*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m0 4705*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m0 4706*c0909341SAndroid Build Coastguard Worker test wpadd, wpadd 4707*c0909341SAndroid Build Coastguard Worker jnz .w32_wpad 4708*c0909341SAndroid Build Coastguard Worker.w32_loop: 4709*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4710*c0909341SAndroid Build Coastguard Worker mova m1, m0 4711*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4712*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4713*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4714*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4715*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4716*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4717*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4718*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 
4719*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4720*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4721*c0909341SAndroid Build Coastguard Worker mova m4, [yq+16] 4722*c0909341SAndroid Build Coastguard Worker mova m3, m4 4723*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m3 4724*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 4725*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4726*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4727*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4728*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m4 4729*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2 4730*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4731*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4732*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4733*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4734*c0909341SAndroid Build Coastguard Worker add acq, 64 4735*c0909341SAndroid Build Coastguard Worker sub hd, 1 4736*c0909341SAndroid Build Coastguard Worker jg .w32_loop 4737*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4738*c0909341SAndroid Build Coastguard Worker jz .calc_avg_32 4739*c0909341SAndroid Build Coastguard Worker jmp .w32_hpad_loop 4740*c0909341SAndroid Build Coastguard Worker.w32_wpad: 4741*c0909341SAndroid Build Coastguard Worker cmp wpadd, 2 4742*c0909341SAndroid Build Coastguard Worker jl .w32_pad1 4743*c0909341SAndroid Build Coastguard Worker je .w32_pad2 4744*c0909341SAndroid Build Coastguard Worker cmp wpadd, 4 4745*c0909341SAndroid Build Coastguard Worker jl .w32_pad3 4746*c0909341SAndroid Build Coastguard Worker je .w32_pad4 4747*c0909341SAndroid Build Coastguard Worker cmp wpadd, 6 4748*c0909341SAndroid Build Coastguard Worker jl .w32_pad5 4749*c0909341SAndroid Build Coastguard Worker je .w32_pad6 4750*c0909341SAndroid Build Coastguard Worker.w32_pad7: 4751*c0909341SAndroid 
Build Coastguard Worker movd m1, [yq] 4752*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4753*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 4754*c0909341SAndroid Build Coastguard Worker pshufhw m1, m1, q3333 4755*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4756*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4757*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4758*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4759*c0909341SAndroid Build Coastguard Worker mova m0, m1 4760*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4761*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4762*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4763*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4764*c0909341SAndroid Build Coastguard Worker mova m3, m0 4765*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4766*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4767*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4768*c0909341SAndroid Build Coastguard Worker mova m4, m3 4769*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4770*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4771*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4772*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4773*c0909341SAndroid Build Coastguard Worker add acq, 64 4774*c0909341SAndroid Build Coastguard Worker sub hd, 1 4775*c0909341SAndroid Build Coastguard Worker jg .w32_pad7 4776*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4777*c0909341SAndroid Build Coastguard Worker.w32_pad6: 4778*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4779*c0909341SAndroid Build Coastguard Worker mova m1, m0 4780*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4781*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4782*c0909341SAndroid Build Coastguard Worker mova [acq], m1 
4783*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4784*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4785*c0909341SAndroid Build Coastguard Worker pshufhw m0, m1, q3333 4786*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m0 4787*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4788*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4789*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4790*c0909341SAndroid Build Coastguard Worker mova m3, m0 4791*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4792*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4793*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4794*c0909341SAndroid Build Coastguard Worker mova m4, m3 4795*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4796*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4797*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4798*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4799*c0909341SAndroid Build Coastguard Worker add acq, 64 4800*c0909341SAndroid Build Coastguard Worker sub hd, 1 4801*c0909341SAndroid Build Coastguard Worker jg .w32_pad6 4802*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4803*c0909341SAndroid Build Coastguard Worker.w32_pad5: 4804*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4805*c0909341SAndroid Build Coastguard Worker mova m1, m0 4806*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4807*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4808*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4809*c0909341SAndroid Build Coastguard Worker mova m5, [rsp] 4810*c0909341SAndroid Build Coastguard Worker paddw m5, m1 4811*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4812*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4813*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 4814*c0909341SAndroid Build Coastguard Worker pshufhw m0, 
m0, q3333 4815*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4816*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4817*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4818*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4819*c0909341SAndroid Build Coastguard Worker mova m3, m0 4820*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 4821*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4822*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4823*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4824*c0909341SAndroid Build Coastguard Worker mova m4, m3 4825*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4826*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4827*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4828*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4829*c0909341SAndroid Build Coastguard Worker add acq, 64 4830*c0909341SAndroid Build Coastguard Worker sub hd, 1 4831*c0909341SAndroid Build Coastguard Worker jg .w32_pad5 4832*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4833*c0909341SAndroid Build Coastguard Worker.w32_pad4: 4834*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4835*c0909341SAndroid Build Coastguard Worker mova m1, m0 4836*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4837*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4838*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4839*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4840*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4841*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4842*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4843*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4844*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4845*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4846*c0909341SAndroid Build Coastguard 
Worker mova m3, m0 4847*c0909341SAndroid Build Coastguard Worker pshufhw m3, m3, q3333 4848*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 4849*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4850*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4851*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4852*c0909341SAndroid Build Coastguard Worker mova m4, m3 4853*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4854*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4855*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4856*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4857*c0909341SAndroid Build Coastguard Worker add acq, 64 4858*c0909341SAndroid Build Coastguard Worker sub hd, 1 4859*c0909341SAndroid Build Coastguard Worker jg .w32_pad4 4860*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4861*c0909341SAndroid Build Coastguard Worker.w32_pad3: 4862*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4863*c0909341SAndroid Build Coastguard Worker mova m1, m0 4864*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4865*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4866*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4867*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4868*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4869*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4870*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4871*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4872*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4873*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4874*c0909341SAndroid Build Coastguard Worker movd m3, [yq+16] 4875*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m3 4876*c0909341SAndroid Build Coastguard Worker punpcklqdq m3, m3 4877*c0909341SAndroid Build Coastguard Worker pshufhw m3, m3, q3333 4878*c0909341SAndroid 
Build Coastguard Worker pmaddubsw m3, m2 4879*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4880*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4881*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4882*c0909341SAndroid Build Coastguard Worker mova m4, m3 4883*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m4 4884*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4885*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4886*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4887*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4888*c0909341SAndroid Build Coastguard Worker add acq, 64 4889*c0909341SAndroid Build Coastguard Worker sub hd, 1 4890*c0909341SAndroid Build Coastguard Worker jg .w32_pad3 4891*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4892*c0909341SAndroid Build Coastguard Worker.w32_pad2: 4893*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4894*c0909341SAndroid Build Coastguard Worker mova m1, m0 4895*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4896*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4897*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4898*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4899*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4900*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4901*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4902*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4903*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4904*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4905*c0909341SAndroid Build Coastguard Worker mova m3, [yq+16] 4906*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m3 4907*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 4908*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4909*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 
4910*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4911*c0909341SAndroid Build Coastguard Worker pshufhw m4, m3, q3333 4912*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m4 4913*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4914*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4915*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4916*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4917*c0909341SAndroid Build Coastguard Worker add acq, 64 4918*c0909341SAndroid Build Coastguard Worker sub hd, 1 4919*c0909341SAndroid Build Coastguard Worker jg .w32_pad2 4920*c0909341SAndroid Build Coastguard Worker jmp .w32_wpad_done 4921*c0909341SAndroid Build Coastguard Worker.w32_pad1: 4922*c0909341SAndroid Build Coastguard Worker mova m0, [yq] 4923*c0909341SAndroid Build Coastguard Worker mova m1, m0 4924*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m1 4925*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 4926*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4927*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4928*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4929*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m0 4930*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 4931*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4932*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4933*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4934*c0909341SAndroid Build Coastguard Worker mova m4, [yq+16] 4935*c0909341SAndroid Build Coastguard Worker mova m3, m4 4936*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m3 4937*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 4938*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4939*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4940*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4941*c0909341SAndroid Build Coastguard Worker 
punpckhbw m4, m4 4942*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m4 4943*c0909341SAndroid Build Coastguard Worker pshufhw m4, m4, q3333 4944*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2 4945*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4946*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4947*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4948*c0909341SAndroid Build Coastguard Worker lea yq, [yq+strideq] 4949*c0909341SAndroid Build Coastguard Worker add acq, 64 4950*c0909341SAndroid Build Coastguard Worker sub hd, 1 4951*c0909341SAndroid Build Coastguard Worker jg .w32_pad1 4952*c0909341SAndroid Build Coastguard Worker.w32_wpad_done: 4953*c0909341SAndroid Build Coastguard Worker test hpadd, hpadd 4954*c0909341SAndroid Build Coastguard Worker jz .calc_avg_32 4955*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop: 4956*c0909341SAndroid Build Coastguard Worker mova [acq], m1 4957*c0909341SAndroid Build Coastguard Worker mova [acq+16], m0 4958*c0909341SAndroid Build Coastguard Worker paddw m5, m1, [rsp] 4959*c0909341SAndroid Build Coastguard Worker mova [rsp ], m5 4960*c0909341SAndroid Build Coastguard Worker paddw m5, m0, [rsp+16] 4961*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m5 4962*c0909341SAndroid Build Coastguard Worker mova [acq+32], m3 4963*c0909341SAndroid Build Coastguard Worker mova [acq+48], m4 4964*c0909341SAndroid Build Coastguard Worker paddw m5, m3, [rsp+32] 4965*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m5 4966*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [rsp+48] 4967*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m5 4968*c0909341SAndroid Build Coastguard Worker add acq, 64 4969*c0909341SAndroid Build Coastguard Worker sub hpadd, 1 4970*c0909341SAndroid Build Coastguard Worker jg .w32_hpad_loop 4971*c0909341SAndroid Build Coastguard Worker 4972*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 4973*c0909341SAndroid Build 
Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 4974*c0909341SAndroid Build Coastguard Worker%else 4975*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 4976*c0909341SAndroid Build Coastguard Worker%endif 4977*c0909341SAndroid Build Coastguard Worker 4978*c0909341SAndroid Build Coastguard Worker.calc_avg_32: 4979*c0909341SAndroid Build Coastguard Worker mova m5, [rsp] 4980*c0909341SAndroid Build Coastguard Worker mova m0, m5 4981*c0909341SAndroid Build Coastguard Worker psrld m5, 16 4982*c0909341SAndroid Build Coastguard Worker pslld m0, 16 4983*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4984*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4985*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+16] 4986*c0909341SAndroid Build Coastguard Worker mova m3, m0 4987*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4988*c0909341SAndroid Build Coastguard Worker pslld m3, 16 4989*c0909341SAndroid Build Coastguard Worker psrld m3, 16 4990*c0909341SAndroid Build Coastguard Worker paddd m0, m3 4991*c0909341SAndroid Build Coastguard Worker paddd m5, m0 4992*c0909341SAndroid Build Coastguard Worker mova m0, [rsp+32] 4993*c0909341SAndroid Build Coastguard Worker mova m3, m0 4994*c0909341SAndroid Build Coastguard Worker psrld m0, 16 4995*c0909341SAndroid Build Coastguard Worker pslld m3, 16 4996*c0909341SAndroid Build Coastguard Worker psrld m3, 16 4997*c0909341SAndroid Build Coastguard Worker paddd m0, m3 4998*c0909341SAndroid Build Coastguard Worker mova m1, [rsp+48] 4999*c0909341SAndroid Build Coastguard Worker mova m3, m1 5000*c0909341SAndroid Build Coastguard Worker psrld m1, 16 5001*c0909341SAndroid Build Coastguard Worker pslld m3, 16 5002*c0909341SAndroid Build Coastguard Worker psrld m3, 16 5003*c0909341SAndroid Build Coastguard Worker paddd m1, m3 5004*c0909341SAndroid Build Coastguard Worker paddd m1, m0 5005*c0909341SAndroid Build Coastguard Worker paddd m5, m1 5006*c0909341SAndroid Build 
; ---------------------------------------------------------------------------
; NOTE(review): the first part of the row below (.calc_avg / .sub_loop / RET)
; is the tail of a routine that starts earlier in the file; it is reproduced
; unchanged. The notes here document the BLEND macro defined at the end of
; the row.
;
; BLEND mask, true, false -- bytewise select without a blend instruction:
;     %1 = (%2 AND %1) OR ((NOT %1) AND %3)
;   %1  in: mask register; out: blended result (the mask is overwritten).
;   %2  in: register with the values taken where mask bits are set.
;       CLOBBERED: "pand %2, %1" writes the masked values back into %2.
;   %3  in: register or memory operand with the values taken where mask bits
;       are clear; only read.
; Statement order matters: pand has to consume the mask in %1 before pandn
; overwrites %1.
; ---------------------------------------------------------------------------
Coastguard Worker.calc_avg: 5007*c0909341SAndroid Build Coastguard Worker movd szd, m6 5008*c0909341SAndroid Build Coastguard Worker psrad m6, 1 5009*c0909341SAndroid Build Coastguard Worker tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); 5010*c0909341SAndroid Build Coastguard Worker paddd m5, m6 5011*c0909341SAndroid Build Coastguard Worker movd m1, r1d 5012*c0909341SAndroid Build Coastguard Worker pshufd m0, m5, q2301 5013*c0909341SAndroid Build Coastguard Worker paddd m0, m5 5014*c0909341SAndroid Build Coastguard Worker pshufd m5, m0, q1032 5015*c0909341SAndroid Build Coastguard Worker paddd m0, m5 5016*c0909341SAndroid Build Coastguard Worker psrad m0, m1 ; sum >>= log2sz; 5017*c0909341SAndroid Build Coastguard Worker packssdw m0, m0 5018*c0909341SAndroid Build Coastguard Worker RELOAD_ACQ_32 acq ; ac = ac_orig 5019*c0909341SAndroid Build Coastguard Worker.sub_loop: 5020*c0909341SAndroid Build Coastguard Worker mova m1, [acq] 5021*c0909341SAndroid Build Coastguard Worker psubw m1, m0 5022*c0909341SAndroid Build Coastguard Worker mova [acq], m1 5023*c0909341SAndroid Build Coastguard Worker add acq, 16 5024*c0909341SAndroid Build Coastguard Worker sub szd, 8 5025*c0909341SAndroid Build Coastguard Worker jg .sub_loop 5026*c0909341SAndroid Build Coastguard Worker RET 5027*c0909341SAndroid Build Coastguard Worker 5028*c0909341SAndroid Build Coastguard Worker; %1 simd register that hold the mask and will hold the result 5029*c0909341SAndroid Build Coastguard Worker; %2 simd register that holds the "true" values 5030*c0909341SAndroid Build Coastguard Worker; %3 location of the "false" values (simd register/memory) 5031*c0909341SAndroid Build Coastguard Worker%macro BLEND 3 ; mask, true, false 5032*c0909341SAndroid Build Coastguard Worker pand %2, %1 5033*c0909341SAndroid Build Coastguard Worker pandn %1, %3 5034*c0909341SAndroid Build Coastguard Worker por %1, %2 5035*c0909341SAndroid Build Coastguard Worker%endmacro 5036*c0909341SAndroid Build
; ---------------------------------------------------------------------------
; PAETH top, ldiff -- Paeth selection for 16 bytes at once.
;   %1  register NUMBER holding the top-row bytes (read as m%1, not modified).
;   %2  ldiff: either a register number (used as m%2) or a memory operand;
;       the %ifnum branch picks the matching instruction form.
;   Implicit inputs set up by the caller:
;     m3  left-edge pixel(s) (callers mark this value "left")
;     m5  top-left byte, broadcast by the caller with pshufb against zero
;     m4  byte mask loaded with movddup from ipred_paeth_shuf; it is ANDed
;         with (top ^ left). Table contents are outside this view --
;         presumably 0x01 per byte so the AND isolates pavgb's round-up
;         bit; TODO confirm.
;   Result: m1. Scratch: m0 and m2 are overwritten. m3/m4/m5/m%1 are only read.
;   Steps, following the inline comments:
;     1. m1 = min(tldiff, 255): pavgb gives the rounded-up average of top and
;        left, the masked xor bit is subtracted to get the floor, the two
;        saturating differences against m5 are ORed into an absolute
;        difference, doubled with saturation, and the xor bit is ORed back.
;     2. m2 = |m5 - m3| (labelled tdiff), then m2 = min(tdiff, ldiff).
;     3. m0 = (ldiff == min(tdiff, ldiff)), i.e. ldiff <= tdiff.
;     4. m1 = (min(m1, m2) == m2), i.e. min(ldiff, tdiff) <= tldiff. The
;        inline "&&" comment is looser than what the instructions test.
;     5. BLEND m0, m2, m%1 : m0 = (ldiff <= tdiff) ? left (copy of m3) : top
;        BLEND m1, m0, m5  : m1 = step-4 mask ? m0 : top-left
;
; ipred_paeth_8bpc(dst, stride, tl, w, h)
;   cglobal arguments "3, 6, 8, -7*16": presumably 3 arguments loaded into
;   registers, 6 GPRs, 8 SIMD registers and 7*16 bytes of stack, per the
;   x86inc cglobal convention -- TODO confirm there. The .w64 path uses
;   [rsp+0] .. [rsp+96+15], i.e. exactly 7*16 bytes.
;   Top pixels are read upward from [tlq+1]; left pixels are read from
;   addresses below tlq (tlq is decremented before every load).
; ---------------------------------------------------------------------------
Coastguard Worker 5037*c0909341SAndroid Build Coastguard Worker%macro PAETH 2 ; top, ldiff 5038*c0909341SAndroid Build Coastguard Worker pavgb m1, m%1, m3 5039*c0909341SAndroid Build Coastguard Worker pxor m0, m%1, m3 5040*c0909341SAndroid Build Coastguard Worker pand m0, m4 5041*c0909341SAndroid Build Coastguard Worker psubusb m2, m5, m1 5042*c0909341SAndroid Build Coastguard Worker psubb m1, m0 5043*c0909341SAndroid Build Coastguard Worker psubusb m1, m5 5044*c0909341SAndroid Build Coastguard Worker por m1, m2 5045*c0909341SAndroid Build Coastguard Worker paddusb m1, m1 5046*c0909341SAndroid Build Coastguard Worker por m1, m0 ; min(tldiff, 255) 5047*c0909341SAndroid Build Coastguard Worker psubusb m2, m5, m3 5048*c0909341SAndroid Build Coastguard Worker psubusb m0, m3, m5 5049*c0909341SAndroid Build Coastguard Worker por m2, m0 ; tdiff 5050*c0909341SAndroid Build Coastguard Worker%ifnum %2 5051*c0909341SAndroid Build Coastguard Worker pminub m2, m%2 5052*c0909341SAndroid Build Coastguard Worker pcmpeqb m0, m%2, m2 ; ldiff <= tdiff 5053*c0909341SAndroid Build Coastguard Worker%else 5054*c0909341SAndroid Build Coastguard Worker mova m0, %2 5055*c0909341SAndroid Build Coastguard Worker pminub m2, m0 5056*c0909341SAndroid Build Coastguard Worker pcmpeqb m0, m2 5057*c0909341SAndroid Build Coastguard Worker%endif 5058*c0909341SAndroid Build Coastguard Worker pminub m1, m2 5059*c0909341SAndroid Build Coastguard Worker pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff 5060*c0909341SAndroid Build Coastguard Worker mova m2, m3 5061*c0909341SAndroid Build Coastguard Worker BLEND m0, m2, m%1 5062*c0909341SAndroid Build Coastguard Worker BLEND m1, m0, m5 5063*c0909341SAndroid Build Coastguard Worker%endmacro 5064*c0909341SAndroid Build Coastguard Worker 5065*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h 5066*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_paeth_ssse3_table 5067*c0909341SAndroid Build
; --- Prologue and .w4 (next row) --------------------------------------------
; Prologue: wd = tzcnt(width argument) indexes ipred_paeth_ssse3_table; each
; entry is a 32-bit offset that is sign-extended and added to the table
; address in r5 before "jmp wq". m5 = byte at [tlq] replicated to all 16
; lanes (pshufb with an all-zero control). m4 = PAETH's xor mask.
; .w4: the 4 top bytes at [tlq+1] are replicated into every dword of m6;
; m7 = |m5 - m6| (ldiff). Each loop pass steps tlq back by 4, loads 4 left
; bytes and shuffles them with ipred_h_shuf (table not visible here --
; presumably one left pixel per 4-byte row group; TODO confirm), runs PAETH
; once and stores the four dwords of m1 as four rows: dst, dst+stride,
; dst+stride*2 and dst+r3 (r3 = stride*3).
Coastguard Worker tzcnt wd, wm 5068*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5069*c0909341SAndroid Build Coastguard Worker pxor m0, m0 5070*c0909341SAndroid Build Coastguard Worker movd m5, [tlq] 5071*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 5072*c0909341SAndroid Build Coastguard Worker LEA r5, ipred_paeth_ssse3_table 5073*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 5074*c0909341SAndroid Build Coastguard Worker movddup m4, [base+ipred_paeth_shuf] 5075*c0909341SAndroid Build Coastguard Worker add wq, r5 5076*c0909341SAndroid Build Coastguard Worker jmp wq 5077*c0909341SAndroid Build Coastguard Worker.w4: 5078*c0909341SAndroid Build Coastguard Worker movd m6, [tlq+1] ; top 5079*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q0000 5080*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 5081*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5082*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5083*c0909341SAndroid Build Coastguard Worker por m7, m0 ; ldiff 5084*c0909341SAndroid Build Coastguard Worker.w4_loop: 5085*c0909341SAndroid Build Coastguard Worker sub tlq, 4 5086*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] 5087*c0909341SAndroid Build Coastguard Worker mova m1, [base+ipred_h_shuf] 5088*c0909341SAndroid Build Coastguard Worker pshufb m3, m1 ; left 5089*c0909341SAndroid Build Coastguard Worker PAETH 6, 7 5090*c0909341SAndroid Build Coastguard Worker movd [dstq ], m1 5091*c0909341SAndroid Build Coastguard Worker pshuflw m0, m1, q1032 5092*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq ], m0 5093*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m1 5094*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], m1 5095*c0909341SAndroid Build Coastguard Worker psrlq m1, 32 5096*c0909341SAndroid Build Coastguard Worker movd [dstq+r3 ], m1 5097*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5098*c0909341SAndroid Build
; --- End of .w4 loop, then .w8 and .w16 (next row) --------------------------
; .w8: the 8 top bytes at [tlq+1] are duplicated into both halves of m6
; (movddup). Each pass steps tlq back by 2 and shuffles the loaded left bytes
; with ipred_paeth_shuf; the low and high 8 bytes of the PAETH result go to
; two consecutive rows (movq / movhps), so h drops by 2 per pass.
; .w16: 16 top bytes loaded unaligned (movu). Each pass steps tlq back by 1,
; broadcasts that single left byte (pshufb with zero) and writes one 16-byte
; row with mova -- presumably requiring 16-byte aligned dst rows; TODO confirm.
Coastguard Worker sub hd, 4 5099*c0909341SAndroid Build Coastguard Worker jg .w4_loop 5100*c0909341SAndroid Build Coastguard Worker RET 5101*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5102*c0909341SAndroid Build Coastguard Worker.w8: 5103*c0909341SAndroid Build Coastguard Worker movddup m6, [tlq+1] 5104*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5105*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5106*c0909341SAndroid Build Coastguard Worker por m7, m0 5107*c0909341SAndroid Build Coastguard Worker.w8_loop: 5108*c0909341SAndroid Build Coastguard Worker sub tlq, 2 5109*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] 5110*c0909341SAndroid Build Coastguard Worker pshufb m3, [base+ipred_paeth_shuf] 5111*c0909341SAndroid Build Coastguard Worker PAETH 6, 7 5112*c0909341SAndroid Build Coastguard Worker movq [dstq ], m1 5113*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq], m1 5114*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5115*c0909341SAndroid Build Coastguard Worker sub hd, 2 5116*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5117*c0909341SAndroid Build Coastguard Worker RET 5118*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5119*c0909341SAndroid Build Coastguard Worker.w16: 5120*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+1] 5121*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5122*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5123*c0909341SAndroid Build Coastguard Worker por m7, m0 5124*c0909341SAndroid Build Coastguard Worker.w16_loop: 5125*c0909341SAndroid Build Coastguard Worker sub tlq, 1 5126*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] 5127*c0909341SAndroid Build Coastguard Worker pxor m1, m1 5128*c0909341SAndroid Build Coastguard Worker pshufb m3, m1 5129*c0909341SAndroid Build Coastguard Worker PAETH 6, 7 5130*c0909341SAndroid Build Coastguard Worker mova [dstq], m1 5131*c0909341SAndroid Build
; --- End of .w16 loop, then .w32 (next row) ---------------------------------
; .w32: the row is handled as two 16-byte halves. Stack layout:
;   [rsp+ 0] top bytes 0-15      [rsp+16] ldiff for bytes 0-15
;   [rsp+32] top bytes 16-31     ldiff for bytes 16-31 stays live in m7
; Each pass reloads m6 from the stack before each PAETH call; the first call
; takes its ldiff from memory ([rsp+16]), the second from register 7.
Coastguard Worker add dstq, strideq 5132*c0909341SAndroid Build Coastguard Worker sub hd, 1 5133*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5134*c0909341SAndroid Build Coastguard Worker RET 5135*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5136*c0909341SAndroid Build Coastguard Worker.w32: 5137*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+1] 5138*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5139*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5140*c0909341SAndroid Build Coastguard Worker por m7, m0 5141*c0909341SAndroid Build Coastguard Worker mova [rsp ], m6 5142*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m7 5143*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+17] 5144*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5145*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5146*c0909341SAndroid Build Coastguard Worker por m7, m0 5147*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m6 5148*c0909341SAndroid Build Coastguard Worker.w32_loop: 5149*c0909341SAndroid Build Coastguard Worker dec tlq 5150*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] 5151*c0909341SAndroid Build Coastguard Worker pxor m1, m1 5152*c0909341SAndroid Build Coastguard Worker pshufb m3, m1 5153*c0909341SAndroid Build Coastguard Worker mova m6, [rsp] 5154*c0909341SAndroid Build Coastguard Worker PAETH 6, [rsp+16] 5155*c0909341SAndroid Build Coastguard Worker mova [dstq ], m1 5156*c0909341SAndroid Build Coastguard Worker mova m6, [rsp+32] 5157*c0909341SAndroid Build Coastguard Worker PAETH 6, 7 5158*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 5159*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5160*c0909341SAndroid Build Coastguard Worker dec hd 5161*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5162*c0909341SAndroid Build Coastguard Worker RET 5163*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5164*c0909341SAndroid Build
; --- .w64 (next two rows; the row break falls inside one instruction) -------
; .w64: four 16-byte segments. Stack layout:
;   [rsp+ 0] top 0-15     [rsp+16] ldiff 0-15
;   [rsp+32] top 16-31    [rsp+48] ldiff 16-31
;   [rsp+64] top 32-47    [rsp+80] ldiff 32-47
;   [rsp+96] top 48-63    ldiff 48-63 stays live in m7
; Each pass broadcasts one left byte, runs PAETH four times and writes one
; 64-byte row. After the final RET the FILTER macro begins; it continues past
; the end of this view and is reproduced unchanged.
Coastguard Worker.w64: 5165*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+1] 5166*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5167*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5168*c0909341SAndroid Build Coastguard Worker por m7, m0 5169*c0909341SAndroid Build Coastguard Worker mova [rsp ], m6 5170*c0909341SAndroid Build Coastguard Worker mova [rsp+16], m7 5171*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+17] 5172*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5173*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5174*c0909341SAndroid Build Coastguard Worker por m7, m0 5175*c0909341SAndroid Build Coastguard Worker mova [rsp+32], m6 5176*c0909341SAndroid Build Coastguard Worker mova [rsp+48], m7 5177*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+33] 5178*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5179*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5180*c0909341SAndroid Build Coastguard Worker por m7, m0 5181*c0909341SAndroid Build Coastguard Worker mova [rsp+64], m6 5182*c0909341SAndroid Build Coastguard Worker mova [rsp+80], m7 5183*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+49] 5184*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 5185*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 5186*c0909341SAndroid Build Coastguard Worker por m7, m0 5187*c0909341SAndroid Build Coastguard Worker mova [rsp+96], m6 5188*c0909341SAndroid Build Coastguard Worker.w64_loop: 5189*c0909341SAndroid Build Coastguard Worker dec tlq 5190*c0909341SAndroid Build Coastguard Worker movd m3, [tlq] 5191*c0909341SAndroid Build Coastguard Worker pxor m1, m1 5192*c0909341SAndroid Build Coastguard Worker pshufb m3, m1 5193*c0909341SAndroid Build Coastguard Worker mova m6, [rsp] 5194*c0909341SAndroid Build Coastguard Worker PAETH 6, [rsp+16] 5195*c0909341SAndroid Build Coastguard Worker mova [dstq ], m1 5196*c0909341SAndroid Build Coastguard Worker mova
m6, [rsp+32] 5197*c0909341SAndroid Build Coastguard Worker PAETH 6, [rsp+48] 5198*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 5199*c0909341SAndroid Build Coastguard Worker mova m6, [rsp+64] 5200*c0909341SAndroid Build Coastguard Worker PAETH 6, [rsp+80] 5201*c0909341SAndroid Build Coastguard Worker mova [dstq+32], m1 5202*c0909341SAndroid Build Coastguard Worker mova m6, [rsp+96] 5203*c0909341SAndroid Build Coastguard Worker PAETH 6, 7 5204*c0909341SAndroid Build Coastguard Worker mova [dstq+48], m1 5205*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5206*c0909341SAndroid Build Coastguard Worker dec hd 5207*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5208*c0909341SAndroid Build Coastguard Worker RET 5209*c0909341SAndroid Build Coastguard Worker 5210*c0909341SAndroid Build Coastguard Worker 5211*c0909341SAndroid Build Coastguard Worker%macro FILTER 4 ;dst, src, tmp, shuf 5212*c0909341SAndroid Build Coastguard Worker%ifnum %4 5213*c0909341SAndroid Build Coastguard Worker pshufb m%2, m%4 5214*c0909341SAndroid Build Coastguard Worker%else 5215*c0909341SAndroid Build Coastguard Worker pshufb m%2, %4 5216*c0909341SAndroid Build Coastguard Worker%endif 5217*c0909341SAndroid Build Coastguard Worker pshufd m%1, m%2, q0000 ;p0 p1 5218*c0909341SAndroid Build Coastguard Worker pmaddubsw m%1, m2 5219*c0909341SAndroid Build Coastguard Worker pshufd m%3, m%2, q1111 ;p2 p3 5220*c0909341SAndroid Build Coastguard Worker pmaddubsw m%3, m3 5221*c0909341SAndroid Build Coastguard Worker paddw m%1, [base+pw_8] 5222*c0909341SAndroid Build Coastguard Worker paddw m%1, m%3 5223*c0909341SAndroid Build Coastguard Worker pshufd m%3, m%2, q2222 ;p4 p5 5224*c0909341SAndroid Build Coastguard Worker pmaddubsw m%3, m4 5225*c0909341SAndroid Build Coastguard Worker paddw m%1, m%3 5226*c0909341SAndroid Build Coastguard Worker pshufd m%3, m%2, q3333 ;p6 __ 5227*c0909341SAndroid Build Coastguard Worker pmaddubsw m%3, m5 5228*c0909341SAndroid Build Coastguard Worker
paddw m%1, m%3 5229*c0909341SAndroid Build Coastguard Worker psraw m%1, 4 5230*c0909341SAndroid Build Coastguard Worker packuswb m%1, m%1 5231*c0909341SAndroid Build Coastguard Worker%endmacro 5232*c0909341SAndroid Build Coastguard Worker 5233*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter 5234*c0909341SAndroid Build Coastguard Worker%define base r6-$$ 5235*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 5236*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5237*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm 5238*c0909341SAndroid Build Coastguard Worker movzx filterd, filterb 5239*c0909341SAndroid Build Coastguard Worker%else 5240*c0909341SAndroid Build Coastguard Worker movzx filterd, byte filterm 5241*c0909341SAndroid Build Coastguard Worker%endif 5242*c0909341SAndroid Build Coastguard Worker shl filterd, 6 5243*c0909341SAndroid Build Coastguard Worker lea filterq, [base+filter_intra_taps+filterq] 5244*c0909341SAndroid Build Coastguard Worker movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 5245*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+ipred_filter_ssse3_table+wq*4] 5246*c0909341SAndroid Build Coastguard Worker mova m2, [filterq+16*0] 5247*c0909341SAndroid Build Coastguard Worker mova m3, [filterq+16*1] 5248*c0909341SAndroid Build Coastguard Worker mova m4, [filterq+16*2] 5249*c0909341SAndroid Build Coastguard Worker mova m5, [filterq+16*3] 5250*c0909341SAndroid Build Coastguard Worker lea wq, [base+ipred_filter_ssse3_table+wq] 5251*c0909341SAndroid Build Coastguard Worker mov hd, hm 5252*c0909341SAndroid Build Coastguard Worker jmp wq 5253*c0909341SAndroid Build Coastguard Worker.w4: 5254*c0909341SAndroid Build Coastguard Worker mova m1, [base+filter_shuf1] 5255*c0909341SAndroid Build Coastguard Worker sub tlq, 3 5256*c0909341SAndroid Build Coastguard Worker sub tlq, hq 5257*c0909341SAndroid Build Coastguard Worker jmp .w4_loop_start 5258*c0909341SAndroid Build Coastguard 
Worker.w4_loop: 5259*c0909341SAndroid Build Coastguard Worker movd m0, [tlq+hq] 5260*c0909341SAndroid Build Coastguard Worker punpckldq m0, m6 5261*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5262*c0909341SAndroid Build Coastguard Worker.w4_loop_start: 5263*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 7, 1 5264*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m6 5265*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q1032 5266*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], m6 5267*c0909341SAndroid Build Coastguard Worker sub hd, 2 5268*c0909341SAndroid Build Coastguard Worker jg .w4_loop 5269*c0909341SAndroid Build Coastguard Worker RET 5270*c0909341SAndroid Build Coastguard Worker 5271*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5272*c0909341SAndroid Build Coastguard Worker.w8: 5273*c0909341SAndroid Build Coastguard Worker movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 5274*c0909341SAndroid Build Coastguard Worker sub tlq, 5 5275*c0909341SAndroid Build Coastguard Worker sub tlq, hq 5276*c0909341SAndroid Build Coastguard Worker 5277*c0909341SAndroid Build Coastguard Worker.w8_loop: 5278*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf1] 5279*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5280*c0909341SAndroid Build Coastguard Worker FILTER 0, 6, 1, [base+filter_shuf2] 5281*c0909341SAndroid Build Coastguard Worker 5282*c0909341SAndroid Build Coastguard Worker punpckldq m6, m7, m0 5283*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], m6 5284*c0909341SAndroid Build Coastguard Worker punpckhqdq m6, m6 5285*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], m6 5286*c0909341SAndroid Build Coastguard Worker 5287*c0909341SAndroid Build Coastguard Worker movd m0, [tlq+hq] ;_ 6 5 0 5288*c0909341SAndroid Build Coastguard Worker punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5289*c0909341SAndroid Build Coastguard 
Worker 5290*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5291*c0909341SAndroid Build Coastguard Worker sub hd, 2 5292*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5293*c0909341SAndroid Build Coastguard Worker RET 5294*c0909341SAndroid Build Coastguard Worker 5295*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5296*c0909341SAndroid Build Coastguard Worker.w16: 5297*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+1] ;top row 5298*c0909341SAndroid Build Coastguard Worker sub tlq, 5 5299*c0909341SAndroid Build Coastguard Worker sub tlq, hq 5300*c0909341SAndroid Build Coastguard Worker 5301*c0909341SAndroid Build Coastguard Worker.w16_loop: 5302*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf1] 5303*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5304*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m7 5305*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5306*c0909341SAndroid Build Coastguard Worker palignr m7, m6, 4 5307*c0909341SAndroid Build Coastguard Worker 5308*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5309*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5310*c0909341SAndroid Build Coastguard Worker movd [dstq+4+strideq*0], m6 5311*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5312*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5313*c0909341SAndroid Build Coastguard Worker 5314*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf2] 5315*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5316*c0909341SAndroid Build Coastguard Worker movd [dstq+8+strideq*0], m7 5317*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5318*c0909341SAndroid Build Coastguard Worker palignr m7, m6, 4 5319*c0909341SAndroid Build Coastguard Worker 5320*c0909341SAndroid Build 
Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5321*c0909341SAndroid Build Coastguard Worker movd [dstq+12+strideq*0], m6 5322*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5323*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5324*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m6 5325*c0909341SAndroid Build Coastguard Worker 5326*c0909341SAndroid Build Coastguard Worker movd m0, [tlq+hq] ;_ 6 5 0 5327*c0909341SAndroid Build Coastguard Worker punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5328*c0909341SAndroid Build Coastguard Worker 5329*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5330*c0909341SAndroid Build Coastguard Worker sub hd, 2 5331*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5332*c0909341SAndroid Build Coastguard Worker RET 5333*c0909341SAndroid Build Coastguard Worker 5334*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5335*c0909341SAndroid Build Coastguard Worker.w32: 5336*c0909341SAndroid Build Coastguard Worker movu m6, [tlq+1] ;top row 5337*c0909341SAndroid Build Coastguard Worker lea filterq, [tlq+17] 5338*c0909341SAndroid Build Coastguard Worker sub tlq, 5 5339*c0909341SAndroid Build Coastguard Worker sub tlq, hq 5340*c0909341SAndroid Build Coastguard Worker 5341*c0909341SAndroid Build Coastguard Worker.w32_loop: 5342*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf1] 5343*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5344*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], m7 5345*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5346*c0909341SAndroid Build Coastguard Worker palignr m7, m6, 4 5347*c0909341SAndroid Build Coastguard Worker 5348*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5349*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5350*c0909341SAndroid Build Coastguard Worker movd 
[dstq+4+strideq*0], m6 5351*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5352*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5353*c0909341SAndroid Build Coastguard Worker 5354*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf2] 5355*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5356*c0909341SAndroid Build Coastguard Worker movd [dstq+8+strideq*0], m7 5357*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5358*c0909341SAndroid Build Coastguard Worker palignr m7, m6, 4 5359*c0909341SAndroid Build Coastguard Worker 5360*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5361*c0909341SAndroid Build Coastguard Worker movu m1, [filterq] 5362*c0909341SAndroid Build Coastguard Worker punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ 5363*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5364*c0909341SAndroid Build Coastguard Worker movd [dstq+12+strideq*0], m6 5365*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5366*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5367*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m6 5368*c0909341SAndroid Build Coastguard Worker 5369*c0909341SAndroid Build Coastguard Worker mova m6, m1 5370*c0909341SAndroid Build Coastguard Worker 5371*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 6, [base+filter_shuf2] 5372*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5373*c0909341SAndroid Build Coastguard Worker movd [dstq+16+strideq*0], m7 5374*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5375*c0909341SAndroid Build Coastguard Worker palignr m7, m1, 4 5376*c0909341SAndroid Build Coastguard Worker 5377*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5378*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 
5379*c0909341SAndroid Build Coastguard Worker movd [dstq+20+strideq*0], m6 5380*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5381*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5382*c0909341SAndroid Build Coastguard Worker 5383*c0909341SAndroid Build Coastguard Worker FILTER 7, 0, 1, [base+filter_shuf2] 5384*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5385*c0909341SAndroid Build Coastguard Worker movd [dstq+24+strideq*0], m7 5386*c0909341SAndroid Build Coastguard Worker psrlq m7, 32 5387*c0909341SAndroid Build Coastguard Worker palignr m7, m6, 4 5388*c0909341SAndroid Build Coastguard Worker 5389*c0909341SAndroid Build Coastguard Worker FILTER 6, 0, 1, [base+filter_shuf2] 5390*c0909341SAndroid Build Coastguard Worker movd [dstq+28+strideq*0], m6 5391*c0909341SAndroid Build Coastguard Worker psrlq m6, 32 5392*c0909341SAndroid Build Coastguard Worker palignr m6, m7, 4 5393*c0909341SAndroid Build Coastguard Worker mova [dstq+16+strideq*1], m6 5394*c0909341SAndroid Build Coastguard Worker 5395*c0909341SAndroid Build Coastguard Worker mova m6, [dstq+strideq*1] 5396*c0909341SAndroid Build Coastguard Worker movd m0, [tlq+hq] ;_ 6 5 0 5397*c0909341SAndroid Build Coastguard Worker punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5398*c0909341SAndroid Build Coastguard Worker lea filterq, [dstq+16+strideq*1] 5399*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5400*c0909341SAndroid Build Coastguard Worker sub hd, 2 5401*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5402*c0909341SAndroid Build Coastguard Worker RET 5403