1; Copyright © 2020, VideoLAN and dav1d authors 2; Copyright © 2020, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; Read-only data for the 16bpc motion-compensation AVX-512 (Ice Lake) code.
; The db tables below are byte-index shuffle/permute control vectors
; (vpshufb/vpermb-style operands); their consumers are defined further down
; in this file.  Pairs of consecutive byte indices select 16-bit pixels.
SECTION_RODATA 64

; Horizontal subpel shuffles: overlapping 2-pixel windows sliding by one
; pixel (0 1 2 3, 2 3 4 5, ...) for the A/B/C/D phases of the h filters.
spel_h_shufA:  db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
               db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC:  db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
               db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
               db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
               db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB:  db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
               db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
               db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
               db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
               db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
; Vertical subpel shuffles: interleave words from two source registers
; (indices >= 16/32 select from the second source).
spel_v_shuf8:  db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
               db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
; prep output permutes: pick the odd/even byte of each dword (i.e. take the
; high-ish bytes of packed intermediates) in A/B/C interleaving orders.
prep_endA:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB:     db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
               db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
               db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC:     db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
               db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
               db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
               db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
; Subpel intermediate shuffles per block width (4/8/16/32 and 2-wide special
; cases).  NOTE(review): exact lane semantics depend on the consumers below,
; which are outside this chunk for the 6tap/8tap paths.
spel_shuf4a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
               db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b:   db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
               db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
               db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
               db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16:   db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
               db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
               db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
               db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32:   db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
               db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
               db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
               db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
               db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
               db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
; w_mask output permutes: gather one byte per pixel (step 4 for 420/422,
; step 2 for 444) from packed intermediates.
w_mask_end42x: db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4:  db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
               db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
               db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8:  db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
               db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
               db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
; 8x8 warp permutes: pair each dword with the matching dword of the second
; source (indices >= 32), sliding by one pixel per step.
warp8x8_permA: db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
               db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
               db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end:   db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf aliases the pd_0to7 dwords below, read as qwords; the
; commented dq documents the intended qword pattern.
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7:       dd  0,  1,  2,  3,  4,  5,  6,  7
               dd 1   ; interleaved scalar, addressed via offset from a
pw_2048:       times 2 dw 2048
               dd 3   ; neighbouring label — do not reorder these lines
pw_8192:       times 2 dw 8192
avg_shift:     dw 5, 5, 3, 3
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
; -1 bytes zero the destination byte in pshufb-style shuffles.
warp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; resize permutes/shuffles and the per-pixel scale multiplier ramp.
resize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC:  dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD:  dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE:  dq 0, 2, 4, 6
resize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
resize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; Edge-clamping shuffle: replicates the first/last pixel at the borders.
resize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

; Rounding/shift constants; the dw/dd pairs hold (10-bit, 12-bit) variants
; selected at runtime from bitdepth_max (see the r5d/r6d >> 11 dispatch in
; the functions below).
prep_hv_shift:   dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul:        dw 16, 16, 4, 4
put_8tap_h_rnd:  dd 34, 40
prep_8tap_rnd:   dd 128 - (8192 << 8)
warp_8x8_rnd_h:  dd 512, 2048
warp_8x8_rnd_v:  dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round:       dw -16400, -16400, -16388, -16388
w_avg_round:     dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round:      dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round:    dd 128, 64
bidir_shift:     dw 6, 6, 4, 4

pb_64:    times 4 db 64
pw_m512:  times 2 dw -512
pw_2:     times 2 dw 2
pw_64:    times 2 dw 64
pd_32:    dd 32
pd_63:    dd 63
pd_128:   dd 128
pd_640:   dd 640
pd_2176:  dd 2176
pd_16384: dd 16384
pd_0_4:   dd 0, 4

; Alias constants onto existing data whose leading words/dwords happen to
; hold the wanted values (prep_mul starts with 16,16; warp_8x8_rnd_h with 512).
%define pw_16  prep_mul
%define pd_512 warp_8x8_rnd_h

; Build a table of 16-bit offsets from a base label to per-width entry
; points.  %1 = fn name, %2 = isa suffix, %3... = block widths.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Build h/v/hv jump tables for a subpel function.  %1 = put/prep,
; %2 = filter name, %3 = isa suffix, %4 = bitmask of which tables to emit
; (1 = h, 2 = v, 4 = hv), %5... = block widths.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; Build a table of 32-bit offsets for the bidirectional (avg/mask/blend)
; functions, indexed by log2(width)-2.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx512icl  mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg,        avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE  put,        avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE  prep,       avx512icl,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

INIT_ZMM avx512icl
; void put_bilin_16bpc(pixel *dst, ptrdiff_t ds, const pixel *src,
;                      ptrdiff_t ss, int w, int h, int mx, int my,
;                      int bitdepth_max)
; Copy (.put) or bilinearly filter (.h/.v/.hv) a block of 16-bit pixels.
; Dispatch: per-width entry point looked up by tzcnt(w) in the jump tables
; built above; most loops process two rows per iteration.
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
    mov          mxyd, r6m ; mx
    lea            r7, [put_avx512icl]
    tzcnt         t0d, wm
    movifnidn      hd, hm
    test         mxyd, mxyd
    jnz .h
    mov          mxyd, r7m ; my
    test         mxyd, mxyd
    jnz .v
.put: ; no subpel offset: plain 2-rows-at-a-time copy
    movzx         t0d, word [r7+t0*2+table_offset(put,)]
    add            t0, r7
    jmp            t0
.put_w2:
    mov           r6d, [srcq+ssq*0]
    mov           r7d, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6d
    mov [dstq+dsq*1], r7d
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w2
    RET
.put_w4:
    mov            r6, [srcq+ssq*0]
    mov            r7, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6
    mov [dstq+dsq*1], r7
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w4
    RET
.put_w8:
    movu         xmm0, [srcq+ssq*0]
    movu         xmm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], xmm0
    mova [dstq+dsq*1], xmm1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w8
    RET
.put_w16:
    movu          ym0, [srcq+ssq*0]
    movu          ym1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], ym0
    mova [dstq+dsq*1], ym1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w16
    RET
.put_w32:
    movu           m0, [srcq+ssq*0]
    movu           m1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w32
    RET
.put_w64:
    movu           m0, [srcq+ssq*0+64*0]
    movu           m1, [srcq+ssq*0+64*1]
    movu           m2, [srcq+ssq*1+64*0]
    movu           m3, [srcq+ssq*1+64*1]
    lea          srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+64*0], m0
    mova [dstq+dsq*0+64*1], m1
    mova [dstq+dsq*1+64*0], m2
    mova [dstq+dsq*1+64*1], m3
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .put_w64
    RET
.put_w128: ; one row (4x 64-byte vectors) per iteration
    movu           m0, [srcq+64*0]
    movu           m1, [srcq+64*1]
    movu           m2, [srcq+64*2]
    movu           m3, [srcq+64*3]
    add          srcq, ssq
    mova  [dstq+64*0], m0
    mova  [dstq+64*1], m1
    mova  [dstq+64*2], m2
    mova  [dstq+64*3], m3
    add          dstq, dsq
    dec            hd
    jg .put_w128
    RET
.h: ; horizontal filter: dst = (src*(16-mx) + src[+1]*mx + rnd) >> 4
    vpbroadcastw   m5, mxyd
    mov          mxyd, r7m ; my
    vpbroadcastd   m4, [pw_16]
    psubw          m4, m5
    test         mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx         t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
    mov           r6d, r8m ; bitdepth_max
    add            t0, r7
    shr           r6d, 11  ; 0 for 10bpc, 1 for 12bpc -> rounding selector
    vpbroadcastd   m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
    jmp            t0
.h_w2:
    movq         xmm1, [srcq+ssq*0]
    movhps       xmm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    pmullw       xmm0, xmm1, xm4
    psrlq        xmm1, 16     ; shift in the +1 pixels
    pmullw       xmm1, xm5
    paddw        xmm0, xm6
    paddw        xmm0, xmm1
    psrlw        xmm0, 4
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 2
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .h_w2
    RET
.h_w4:
    movq         xmm0, [srcq+ssq*0+0]
    movhps       xmm0, [srcq+ssq*1+0]
    movq         xmm1, [srcq+ssq*0+2]
    movhps       xmm1, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    pmullw       xmm0, xm4
    pmullw       xmm1, xm5
    paddw        xmm0, xm6
    paddw        xmm0, xmm1
    psrlw        xmm0, 4
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .h_w4
    RET
.h_w8:
    movu          xm0, [srcq+ssq*0+0]
    vinserti32x4  ym0, [srcq+ssq*1+0], 1
    movu          xm1, [srcq+ssq*0+2]
    vinserti32x4  ym1, [srcq+ssq*1+2], 1
    lea          srcq, [srcq+ssq*2]
    pmullw        ym0, ym4
    pmullw        ym1, ym5
    paddw         ym0, ym6
    paddw         ym0, ym1
    psrlw         ym0, 4
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .h_w8
    RET
.h_w16:
    movu          ym0, [srcq+ssq*0+0]
    vinserti32x8   m0, [srcq+ssq*1+0], 1
    movu          ym1, [srcq+ssq*0+2]
    vinserti32x8   m1, [srcq+ssq*1+2], 1
    lea          srcq, [srcq+ssq*2]
    pmullw         m0, m4
    pmullw         m1, m5
    paddw          m0, m6
    paddw          m0, m1
    psrlw          m0, 4
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw         m0, m4, [srcq+ssq*0+0]
    pmullw         m2, m5, [srcq+ssq*0+2]
    pmullw         m1, m4, [srcq+ssq*1+0]
    pmullw         m3, m5, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    paddw          m0, m6
    paddw          m1, m6
    paddw          m0, m2
    paddw          m1, m3
    psrlw          m0, 4
    psrlw          m1, 4
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .h_w32
    RET
.h_w64: ; one row, two 64-byte halves
    pmullw         m0, m4, [srcq+64*0+0]
    pmullw         m2, m5, [srcq+64*0+2]
    pmullw         m1, m4, [srcq+64*1+0]
    pmullw         m3, m5, [srcq+64*1+2]
    add          srcq, ssq
    paddw          m0, m6
    paddw          m1, m6
    paddw          m0, m2
    paddw          m1, m3
    psrlw          m0, 4
    psrlw          m1, 4
    mova  [dstq+64*0], m0
    mova  [dstq+64*1], m1
    add          dstq, dsq
    dec            hd
    jg .h_w64
    RET
.h_w128: ; one row, four 64-byte quarters
    pmullw         m0, m4, [srcq+64*0+0]
    pmullw         m7, m5, [srcq+64*0+2]
    pmullw         m1, m4, [srcq+64*1+0]
    pmullw         m8, m5, [srcq+64*1+2]
    pmullw         m2, m4, [srcq+64*2+0]
    pmullw         m9, m5, [srcq+64*2+2]
    pmullw         m3, m4, [srcq+64*3+0]
    pmullw        m10, m5, [srcq+64*3+2]
    add          srcq, ssq
    REPX {paddw x, m6}, m0, m1, m2, m3
    paddw          m0, m7
    paddw          m1, m8
    paddw          m2, m9
    paddw          m3, m10
    REPX {psrlw x, 4}, m0, m1, m2, m3
    mova  [dstq+64*0], m0
    mova  [dstq+64*1], m1
    mova  [dstq+64*2], m2
    mova  [dstq+64*3], m3
    add          dstq, dsq
    dec            hd
    jg .h_w128
    RET
.v: ; vertical filter via pmulhrsw: dst = a + ((b-a)*(my<<11)*2+rnd)>>16
    movzx         t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
    shl          mxyd, 11
    vpbroadcastw   m8, mxyd
    add            t0, r7
    jmp            t0
.v_w2:
    movd         xmm0, [srcq+ssq*0]
.v_w2_loop:
    movd         xmm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    punpckldq    xmm2, xmm0, xmm1
    movd         xmm0, [srcq+ssq*0]
    punpckldq    xmm1, xmm0
    psubw        xmm1, xmm2
    pmulhrsw     xmm1, xm8
    paddw        xmm1, xmm2
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq         xmm0, [srcq+ssq*0]
.v_w4_loop:
    movq         xmm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    punpcklqdq   xmm2, xmm0, xmm1
    movq         xmm0, [srcq+ssq*0]
    punpcklqdq   xmm1, xmm0
    psubw        xmm1, xmm2
    pmulhrsw     xmm1, xm8
    paddw        xmm1, xmm2
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w4_loop
    RET
.v_w8: ; uses plain AVX2 ymm regs; vzeroupper on exit (see after loop)
    movu         xmm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128 ymm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vpblendd     ymm2, ymm0, ymm1, 0xf0
    vbroadcasti128 ymm0, [srcq+ssq*0]
    vpblendd     ymm1, ymm0, 0xf0
    psubw        ymm1, ymm2
    pmulhrsw     ymm1, ym8
    paddw        ymm1, ymm2
    mova [dstq+dsq*0], xmm1
    vextracti128 [dstq+dsq*1], ymm1, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
.v_w16:
    movu          ym0, [srcq+ssq*0]
.v_w16_loop:
    movu          ym3, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    psubw         ym1, ym3, ym0
    pmulhrsw      ym1, ym8
    paddw         ym1, ym0
    movu          ym0, [srcq+ssq*0]
    psubw         ym2, ym0, ym3
    pmulhrsw      ym2, ym8
    paddw         ym2, ym3
    mova [dstq+dsq*0], ym1
    mova [dstq+dsq*1], ym2
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w16_loop
    RET
.v_w32:
    movu           m0, [srcq+ssq*0]
.v_w32_loop:
    movu           m3, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    psubw          m1, m3, m0
    pmulhrsw       m1, m8
    paddw          m1, m0
    movu           m0, [srcq+ssq*0]
    psubw          m2, m0, m3
    pmulhrsw       m2, m8
    paddw          m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w32_loop
    RET
.v_w64: ; two 64-byte halves per row; previous row kept in m0/m1
    movu           m0, [srcq+ssq*0+64*0]
    movu           m1, [srcq+ssq*0+64*1]
.v_w64_loop:
    movu           m2, [srcq+ssq*1+64*0]
    movu           m3, [srcq+ssq*1+64*1]
    lea          srcq, [srcq+ssq*2]
    psubw          m4, m2, m0
    pmulhrsw       m4, m8
    paddw          m4, m0
    movu           m0, [srcq+ssq*0+64*0]
    psubw          m5, m3, m1
    pmulhrsw       m5, m8
    paddw          m5, m1
    movu           m1, [srcq+ssq*0+64*1]
    psubw          m6, m0, m2
    pmulhrsw       m6, m8
    psubw          m7, m1, m3
    pmulhrsw       m7, m8
    mova [dstq+dsq*0+64*0], m4
    mova [dstq+dsq*0+64*1], m5
    paddw          m6, m2
    paddw          m7, m3
    mova [dstq+dsq*1+64*0], m6
    mova [dstq+dsq*1+64*1], m7
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w64_loop
    RET
.v_w128: ; four 64-byte quarters per row; previous row kept in m0-m3
    movu           m0, [srcq+ssq*0+64*0]
    movu           m1, [srcq+ssq*0+64*1]
    movu           m2, [srcq+ssq*0+64*2]
    movu           m3, [srcq+ssq*0+64*3]
.v_w128_loop:
    movu           m4, [srcq+ssq*1+64*0]
    movu           m5, [srcq+ssq*1+64*1]
    movu           m6, [srcq+ssq*1+64*2]
    movu           m7, [srcq+ssq*1+64*3]
    lea          srcq, [srcq+ssq*2]
    psubw          m9, m4, m0
    pmulhrsw       m9, m8
    paddw          m9, m0
    movu           m0, [srcq+ssq*0+64*0]
    psubw         m10, m5, m1
    pmulhrsw      m10, m8
    paddw         m10, m1
    movu           m1, [srcq+ssq*0+64*1]
    psubw         m11, m6, m2
    pmulhrsw      m11, m8
    paddw         m11, m2
    movu           m2, [srcq+ssq*0+64*2]
    psubw         m12, m7, m3
    pmulhrsw      m12, m8
    paddw         m12, m3
    movu           m3, [srcq+ssq*0+64*3]
    mova [dstq+dsq*0+64*0], m9
    psubw          m9, m0, m4
    pmulhrsw       m9, m8
    mova [dstq+dsq*0+64*1], m10
    psubw         m10, m1, m5
    pmulhrsw      m10, m8
    mova [dstq+dsq*0+64*2], m11
    psubw         m11, m2, m6
    pmulhrsw      m11, m8
    mova [dstq+dsq*0+64*3], m12
    psubw         m12, m3, m7
    pmulhrsw      m12, m8
    paddw          m9, m4
    paddw         m10, m5
    mova [dstq+dsq*1+64*0], m9
    mova [dstq+dsq*1+64*1], m10
    paddw         m11, m6
    paddw         m12, m7
    mova [dstq+dsq*1+64*2], m11
    mova [dstq+dsq*1+64*3], m12
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .v_w128_loop
    RET
.hv: ; separable h then v pass; h output kept at 2 fractional bits,
     ; final pmulhrsw rounds back (pw_2048 for 10bpc, pw_8192 for 12bpc)
    movzx         t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
    shl          mxyd, 11
    vpbroadcastd   m6, [pw_2]
    vpbroadcastw   m7, mxyd
    vpbroadcastd   m8, [pw_8192]
    add            t0, r7
    test    dword r8m, 0x800 ; bitdepth_max & 0x800 -> 12bpc
    jnz .hv_12bpc
    psllw          m4, 2
    psllw          m5, 2
    vpbroadcastd   m8, [pw_2048]
.hv_12bpc:
    jmp            t0
.hv_w2:
    vpbroadcastq xmm1, [srcq+ssq*0]
    pmullw       xmm0, xmm1, xm4
    psrlq        xmm1, 16
    pmullw       xmm1, xm5
    paddw        xmm0, xm6
    paddw        xmm0, xmm1
    psrlw        xmm0, 2
.hv_w2_loop:
    movq         xmm2, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    movhps       xmm2, [srcq+ssq*0]
    pmullw       xmm1, xmm2, xm4
    psrlq        xmm2, 16
    pmullw       xmm2, xm5
    paddw        xmm1, xm6
    paddw        xmm1, xmm2
    psrlw        xmm1, 2            ; 1 _ 2 _
    shufpd       xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
    mova         xmm0, xmm1
    psubw        xmm1, xmm2
    paddw        xmm1, xmm1
    pmulhw       xmm1, xm7
    paddw        xmm1, xmm2
    pmulhrsw     xmm1, xm8
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 2
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    ; aligns the first row into the high half via a -8 byte offset
    ; (NOTE(review): relies on src being readable 8 bytes before the block)
    pmullw       xmm0, xm4, [srcq+ssq*0-8]
    pmullw       xmm1, xm5, [srcq+ssq*0-6]
    paddw        xmm0, xm6
    paddw        xmm0, xmm1
    psrlw        xmm0, 2
.hv_w4_loop:
    movq         xmm1, [srcq+ssq*1+0]
    movq         xmm2, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    movhps       xmm1, [srcq+ssq*0+0]
    movhps       xmm2, [srcq+ssq*0+2]
    pmullw       xmm1, xm4
    pmullw       xmm2, xm5
    paddw        xmm1, xm6
    paddw        xmm1, xmm2
    psrlw        xmm1, 2            ; 1 2
    shufpd       xmm2, xmm0, xmm1, 0x01 ; 0 1
    mova         xmm0, xmm1
    psubw        xmm1, xmm2
    paddw        xmm1, xmm1
    pmulhw       xmm1, xm7
    paddw        xmm1, xmm2
    pmulhrsw     xmm1, xm8
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw       xmm0, xm4, [srcq+ssq*0+0]
    pmullw       xmm1, xm5, [srcq+ssq*0+2]
    paddw        xmm0, xm6
    paddw        xmm0, xmm1
    psrlw        xmm0, 2
    vinserti32x4  ym0, xmm0, 1
.hv_w8_loop:
    movu          xm1, [srcq+ssq*1+0]
    movu          xm2, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    vinserti32x4  ym1, [srcq+ssq*0+0], 1
    vinserti32x4  ym2, [srcq+ssq*0+2], 1
    pmullw        ym1, ym4
    pmullw        ym2, ym5
    paddw         ym1, ym6
    paddw         ym1, ym2
    psrlw         ym1, 2            ; 1 2
    vshufi32x4    ym2, ym0, ym1, 0x01 ; 0 1
    mova          ym0, ym1
    psubw         ym1, ym2
    paddw         ym1, ym1
    pmulhw        ym1, ym7
    paddw         ym1, ym2
    pmulhrsw      ym1, ym8
    mova [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw        ym0, ym4, [srcq+ssq*0+0]
    pmullw        ym1, ym5, [srcq+ssq*0+2]
    paddw         ym0, ym6
    paddw         ym0, ym1
    psrlw         ym0, 2
    vinserti32x8   m0, ym0, 1
.hv_w16_loop:
    movu          ym1, [srcq+ssq*1+0]
    movu          ym2, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    vinserti32x8   m1, [srcq+ssq*0+0], 1
    vinserti32x8   m2, [srcq+ssq*0+2], 1
    pmullw         m1, m4
    pmullw         m2, m5
    paddw          m1, m6
    paddw          m1, m2
    psrlw          m1, 2            ; 1 2
    vshufi32x4     m2, m0, m1, q1032 ; 0 1
    mova           m0, m1
    psubw          m1, m2
    paddw          m1, m1
    pmulhw         m1, m7
    paddw          m1, m2
    pmulhrsw       m1, m8
    mova [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .hv_w16_loop
    RET
.hv_w32: ; w >= 32: process in 32-pixel (64-byte) columns;
.hv_w64: ; r6d packs the column counter (high byte) and row count (low byte)
.hv_w128:
    movifnidn      wd, wm
    lea           r6d, [hq+wq*8-256]
    mov            r4, srcq
    mov            r7, dstq
.hv_w32_loop0:
    pmullw         m0, m4, [srcq+ssq*0+0]
    pmullw         m1, m5, [srcq+ssq*0+2]
    paddw          m0, m6
    paddw          m0, m1
    psrlw          m0, 2
.hv_w32_loop:
    pmullw         m3, m4, [srcq+ssq*1+0]
    pmullw         m1, m5, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    paddw          m3, m6
    paddw          m3, m1
    psrlw          m3, 2
    psubw          m1, m3, m0
    paddw          m1, m1
    pmulhw         m1, m7
    paddw          m1, m0
    pmullw         m0, m4, [srcq+ssq*0+0]
    pmullw         m2, m5, [srcq+ssq*0+2]
    paddw          m0, m6
    paddw          m0, m2
    psrlw          m0, 2
    psubw          m2, m0, m3
    paddw          m2, m2
    pmulhw         m2, m7
    paddw          m2, m3
    pmulhrsw       m1, m8
    pmulhrsw       m2, m8
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea          dstq, [dstq+dsq*2]
    sub            hd, 2
    jg .hv_w32_loop
    add            r4, 64          ; advance to the next 32-pixel column
    add            r7, 64
    movzx          hd, r6b         ; reload row count from packed counter
    mov          srcq, r4
    mov          dstq, r7
    sub           r6d, 1<<8
    jg .hv_w32_loop0
    RET

; void prep_bilin_16bpc(int16_t *tmp, const pixel *src, ptrdiff_t stride,
;                       int w, int h, int mx, int my, int bitdepth_max)
; Prefilter a block into the intermediate (prep) buffer; same dispatch
; scheme as put_bilin above, with bitdepth-dependent scale (prep_mul) and
; pw_8192 bias.  Body continues past this chunk.
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn    mxyd, r5m ; mx
    lea            r6, [prep_avx512icl]
    tzcnt          wd, wm
    movifnidn      hd, hm
    test         mxyd, mxyd
    jnz .h
    mov          mxyd, r6m ; my
    test         mxyd, mxyd
    jnz .v
.prep:
    movzx          wd, word [r6+wq*2+table_offset(prep,)]
    mov           r5d, r7m ; bitdepth_max
    vpbroadcastd   m5, [r6-prep_avx512icl+pw_8192]
    add            wq, r6
    shr           r5d, 11  ; 0 for 10bpc, 1 for 12bpc -> prep_mul selector
    vpbroadcastd   m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea      stride3q, [strideq*3]
    jmp            wq
.prep_w4: ; k1 masks the qword merge of rows 2/3 into ym0
    mov           r3d, 0x0c
    kmovb          k1, r3d
.prep_w4_loop:
    movq          xm0, [srcq+strideq*0]
    movhps        xm0, [srcq+strideq*1]
    vpbroadcastq  ym1, [srcq+strideq*2]
    vpunpcklqdq   ym0{k1}, ym1, [srcq+stride3q] {1to4}
    lea          srcq, [srcq+strideq*4]
    pmullw        ym0, ym4
    psubw         ym0, ym5
    mova       [tmpq], ym0
    add          tmpq, 32
    sub            hd, 4
    jg .prep_w4_loop
    RET
.prep_w8:
    movu          xm0, [srcq+strideq*0]
    vinserti32x4  ym0, [srcq+strideq*1], 1
    vinserti32x4   m0, [srcq+strideq*2], 2
    vinserti32x4   m0, [srcq+stride3q ], 3
    lea          srcq, [srcq+strideq*4]
    pmullw         m0, m4
    psubw          m0, m5
906 mova [tmpq], m0 907 add tmpq, 64 908 sub hd, 4 909 jg .prep_w8 910 RET 911.prep_w16: 912 movu ym0, [srcq+strideq*0] 913 vinserti32x8 m0, [srcq+strideq*1], 1 914 movu ym1, [srcq+strideq*2] 915 vinserti32x8 m1, [srcq+stride3q ], 1 916 lea srcq, [srcq+strideq*4] 917 pmullw m0, m4 918 pmullw m1, m4 919 psubw m0, m5 920 psubw m1, m5 921 mova [tmpq+64*0], m0 922 mova [tmpq+64*1], m1 923 add tmpq, 64*2 924 sub hd, 4 925 jg .prep_w16 926 RET 927.prep_w32: 928 pmullw m0, m4, [srcq+strideq*0] 929 pmullw m1, m4, [srcq+strideq*1] 930 pmullw m2, m4, [srcq+strideq*2] 931 pmullw m3, m4, [srcq+stride3q ] 932 lea srcq, [srcq+strideq*4] 933 REPX {psubw x, m5}, m0, m1, m2, m3 934 mova [tmpq+64*0], m0 935 mova [tmpq+64*1], m1 936 mova [tmpq+64*2], m2 937 mova [tmpq+64*3], m3 938 add tmpq, 64*4 939 sub hd, 4 940 jg .prep_w32 941 RET 942.prep_w64: 943 pmullw m0, m4, [srcq+strideq*0+64*0] 944 pmullw m1, m4, [srcq+strideq*0+64*1] 945 pmullw m2, m4, [srcq+strideq*1+64*0] 946 pmullw m3, m4, [srcq+strideq*1+64*1] 947 lea srcq, [srcq+strideq*2] 948 REPX {psubw x, m5}, m0, m1, m2, m3 949 mova [tmpq+64*0], m0 950 mova [tmpq+64*1], m1 951 mova [tmpq+64*2], m2 952 mova [tmpq+64*3], m3 953 add tmpq, 64*4 954 sub hd, 2 955 jg .prep_w64 956 RET 957.prep_w128: 958 pmullw m0, m4, [srcq+64*0] 959 pmullw m1, m4, [srcq+64*1] 960 pmullw m2, m4, [srcq+64*2] 961 pmullw m3, m4, [srcq+64*3] 962 add srcq, strideq 963 REPX {psubw x, m5}, m0, m1, m2, m3 964 mova [tmpq+64*0], m0 965 mova [tmpq+64*1], m1 966 mova [tmpq+64*2], m2 967 mova [tmpq+64*3], m3 968 add tmpq, 64*4 969 dec hd 970 jg .prep_w128 971 RET 972.h: 973 vpbroadcastw m5, mxyd 974 mov mxyd, r6m ; my 975 vpbroadcastd m4, [pw_16] 976 vpbroadcastd m6, [pw_32766] 977 psubw m4, m5 978 test dword r7m, 0x800 979 jnz .h_12bpc 980 psllw m4, 2 981 psllw m5, 2 982.h_12bpc: 983 test mxyd, mxyd 984 jnz .hv 985 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 986 add wq, r6 987 lea stride3q, [strideq*3] 988 jmp wq 989.h_w4: 990 movu xm1, [srcq+strideq*0] 
991 vinserti32x4 ym1, [srcq+strideq*2], 1 992 movu xm2, [srcq+strideq*1] 993 vinserti32x4 ym2, [srcq+stride3q ], 1 994 lea srcq, [srcq+strideq*4] 995 punpcklqdq ym0, ym1, ym2 996 psrldq ym1, 2 997 psrldq ym2, 2 998 pmullw ym0, ym4 999 punpcklqdq ym1, ym2 1000 pmullw ym1, ym5 1001 psubw ym0, ym6 1002 paddw ym0, ym1 1003 psraw ym0, 2 1004 mova [tmpq], ym0 1005 add tmpq, 32 1006 sub hd, 4 1007 jg .h_w4 1008 RET 1009.h_w8: 1010 movu xm0, [srcq+strideq*0+0] 1011 movu xm1, [srcq+strideq*0+2] 1012 vinserti32x4 ym0, [srcq+strideq*1+0], 1 1013 vinserti32x4 ym1, [srcq+strideq*1+2], 1 1014 vinserti32x4 m0, [srcq+strideq*2+0], 2 1015 vinserti32x4 m1, [srcq+strideq*2+2], 2 1016 vinserti32x4 m0, [srcq+stride3q +0], 3 1017 vinserti32x4 m1, [srcq+stride3q +2], 3 1018 lea srcq, [srcq+strideq*4] 1019 pmullw m0, m4 1020 pmullw m1, m5 1021 psubw m0, m6 1022 paddw m0, m1 1023 psraw m0, 2 1024 mova [tmpq], m0 1025 add tmpq, 64 1026 sub hd, 4 1027 jg .h_w8 1028 RET 1029.h_w16: 1030 movu ym0, [srcq+strideq*0+0] 1031 vinserti32x8 m0, [srcq+strideq*1+0], 1 1032 movu ym1, [srcq+strideq*0+2] 1033 vinserti32x8 m1, [srcq+strideq*1+2], 1 1034 lea srcq, [srcq+strideq*2] 1035 pmullw m0, m4 1036 pmullw m1, m5 1037 psubw m0, m6 1038 paddw m0, m1 1039 psraw m0, 2 1040 mova [tmpq], m0 1041 add tmpq, 64 1042 sub hd, 2 1043 jg .h_w16 1044 RET 1045.h_w32: 1046 pmullw m0, m4, [srcq+strideq*0+0] 1047 pmullw m2, m5, [srcq+strideq*0+2] 1048 pmullw m1, m4, [srcq+strideq*1+0] 1049 pmullw m3, m5, [srcq+strideq*1+2] 1050 lea srcq, [srcq+strideq*2] 1051 psubw m0, m6 1052 psubw m1, m6 1053 paddw m0, m2 1054 paddw m1, m3 1055 psraw m0, 2 1056 psraw m1, 2 1057 mova [tmpq+64*0], m0 1058 mova [tmpq+64*1], m1 1059 add tmpq, 64*2 1060 sub hd, 2 1061 jg .h_w32 1062 RET 1063.h_w64: 1064 pmullw m0, m4, [srcq+ 0] 1065 pmullw m2, m5, [srcq+ 2] 1066 pmullw m1, m4, [srcq+64] 1067 pmullw m3, m5, [srcq+66] 1068 add srcq, strideq 1069 psubw m0, m6 1070 psubw m1, m6 1071 paddw m0, m2 1072 paddw m1, m3 1073 psraw m0, 2 1074 psraw 
m1, 2 1075 mova [tmpq+64*0], m0 1076 mova [tmpq+64*1], m1 1077 add tmpq, 64*2 1078 dec hd 1079 jg .h_w64 1080 RET 1081.h_w128: 1082 pmullw m0, m4, [srcq+ 0] 1083 pmullw m7, m5, [srcq+ 2] 1084 pmullw m1, m4, [srcq+ 64] 1085 pmullw m8, m5, [srcq+ 66] 1086 pmullw m2, m4, [srcq+128] 1087 pmullw m9, m5, [srcq+130] 1088 pmullw m3, m4, [srcq+192] 1089 pmullw m10, m5, [srcq+194] 1090 add srcq, strideq 1091 REPX {psubw x, m6}, m0, m1, m2, m3 1092 paddw m0, m7 1093 paddw m1, m8 1094 paddw m2, m9 1095 paddw m3, m10 1096 REPX {psraw x, 2}, m0, m1, m2, m3 1097 mova [tmpq+64*0], m0 1098 mova [tmpq+64*1], m1 1099 mova [tmpq+64*2], m2 1100 mova [tmpq+64*3], m3 1101 add tmpq, 64*4 1102 dec hd 1103 jg .h_w128 1104 RET 1105.v: 1106 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] 1107 vpbroadcastw m9, mxyd 1108 vpbroadcastd m8, [pw_16] 1109 vpbroadcastd m10, [pw_32766] 1110 add wq, r6 1111 lea stride3q, [strideq*3] 1112 psubw m8, m9 1113 test dword r7m, 0x800 1114 jnz .v_12bpc 1115 psllw m8, 2 1116 psllw m9, 2 1117.v_12bpc: 1118 jmp wq 1119.v_w4: 1120 movq xmm0, [srcq+strideq*0] 1121.v_w4_loop: 1122 vpbroadcastq xmm2, [srcq+strideq*1] 1123 vpbroadcastq ymm1, [srcq+strideq*2] 1124 vpbroadcastq ymm3, [srcq+stride3q ] 1125 lea srcq, [srcq+strideq*4] 1126 vpblendd ymm2, ymm1, 0x30 1127 vpblendd ymm2, ymm3, 0xc0 1128 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 1129 movq xmm0, [srcq+strideq*0] 1130 valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 1131 pmullw ymm1, ym8 1132 pmullw ymm2, ym9 1133 psubw ymm1, ym10 1134 paddw ymm1, ymm2 1135 psraw ymm1, 2 1136 mova [tmpq], ymm1 1137 add tmpq, 32 1138 sub hd, 4 1139 jg .v_w4_loop 1140 vzeroupper 1141 RET 1142.v_w8: 1143 movu xm0, [srcq+strideq*0] 1144.v_w8_loop: 1145 vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1146 vinserti32x4 m1, [srcq+strideq*2], 2 1147 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 1148 lea srcq, [srcq+strideq*4] 1149 movu xm0, [srcq+strideq*0] 1150 valignq m2, m0, m1, 2 ; 1 2 3 4 1151 pmullw m1, m8 1152 pmullw m2, m9 1153 psubw m1, 
m10 1154 paddw m1, m2 1155 psraw m1, 2 1156 mova [tmpq], m1 1157 add tmpq, 64 1158 sub hd, 4 1159 jg .v_w8_loop 1160 RET 1161.v_w16: 1162 movu ym0, [srcq+strideq*0] 1163.v_w16_loop: 1164 vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 1165 movu ym3, [srcq+strideq*2] 1166 vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 1167 lea srcq, [srcq+strideq*4] 1168 movu ym0, [srcq+strideq*0] 1169 vshufi32x4 m3, m1, m3, q1032 ; 1 2 1170 vshufi32x4 m4, m2, m0, q1032 ; 3 4 1171 pmullw m1, m8 1172 pmullw m2, m8 1173 pmullw m3, m9 1174 pmullw m4, m9 1175 psubw m1, m10 1176 psubw m2, m10 1177 paddw m1, m3 1178 paddw m2, m4 1179 psraw m1, 2 1180 psraw m2, 2 1181 mova [tmpq+64*0], m1 1182 mova [tmpq+64*1], m2 1183 add tmpq, 64*2 1184 sub hd, 4 1185 jg .v_w16_loop 1186 RET 1187.v_w32: 1188 movu m0, [srcq+strideq*0] 1189.v_w32_loop: 1190 movu m3, [srcq+strideq*1] 1191 lea srcq, [srcq+strideq*2] 1192 pmullw m1, m8, m0 1193 movu m0, [srcq+strideq*0] 1194 pmullw m2, m8, m3 1195 pmullw m3, m9 1196 pmullw m4, m9, m0 1197 psubw m1, m10 1198 psubw m2, m10 1199 paddw m1, m3 1200 paddw m2, m4 1201 psraw m1, 2 1202 psraw m2, 2 1203 mova [tmpq+64*0], m1 1204 mova [tmpq+64*1], m2 1205 add tmpq, 64*2 1206 sub hd, 2 1207 jg .v_w32_loop 1208 RET 1209.v_w64: 1210 movu m0, [srcq+64*0] 1211 movu m1, [srcq+64*1] 1212.v_w64_loop: 1213 add srcq, strideq 1214 pmullw m2, m8, m0 1215 movu m0, [srcq+64*0] 1216 pmullw m3, m8, m1 1217 movu m1, [srcq+64*1] 1218 pmullw m4, m9, m0 1219 pmullw m5, m9, m1 1220 psubw m2, m10 1221 psubw m3, m10 1222 paddw m2, m4 1223 paddw m3, m5 1224 psraw m2, 2 1225 psraw m3, 2 1226 mova [tmpq+64*0], m2 1227 mova [tmpq+64*1], m3 1228 add tmpq, 64*2 1229 dec hd 1230 jg .v_w64_loop 1231 RET 1232.v_w128: 1233 movu m0, [srcq+64*0] 1234 movu m1, [srcq+64*1] 1235 movu m2, [srcq+64*2] 1236 movu m3, [srcq+64*3] 1237.v_w128_loop: 1238 add srcq, strideq 1239 pmullw m4, m8, m0 1240 movu m0, [srcq+64*0] 1241 pmullw m5, m8, m1 1242 movu m1, [srcq+64*1] 1243 pmullw m6, m8, m2 1244 movu m2, 
[srcq+64*2] 1245 pmullw m7, m8, m3 1246 movu m3, [srcq+64*3] 1247 pmullw m11, m9, m0 1248 pmullw m12, m9, m1 1249 pmullw m13, m9, m2 1250 pmullw m14, m9, m3 1251 REPX {psubw x, m10}, m4, m5, m6, m7 1252 paddw m4, m11 1253 paddw m5, m12 1254 paddw m6, m13 1255 paddw m7, m14 1256 REPX {psraw x, 2}, m4, m5, m6, m7 1257 mova [tmpq+64*0], m4 1258 mova [tmpq+64*1], m5 1259 mova [tmpq+64*2], m6 1260 mova [tmpq+64*3], m7 1261 add tmpq, 64*4 1262 dec hd 1263 jg .v_w128_loop 1264 RET 1265.hv: 1266 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1267 shl mxyd, 11 1268 vpbroadcastw m7, mxyd 1269 add wq, r6 1270 lea stride3q, [strideq*3] 1271 jmp wq 1272.hv_w4: 1273 movq xmm0, [srcq+strideq*0+0] 1274 movq xmm1, [srcq+strideq*0+2] 1275 pmullw xmm0, xm4 1276 pmullw xmm1, xm5 1277 psubw xmm0, xm6 1278 paddw xmm0, xmm1 1279 psraw xmm0, 2 1280 vpbroadcastq ym0, xmm0 1281.hv_w4_loop: 1282 movu xm1, [srcq+strideq*1] 1283 vinserti128 ym1, [srcq+stride3q ], 1 1284 movu xm2, [srcq+strideq*2] 1285 lea srcq, [srcq+strideq*4] 1286 vinserti128 ym2, [srcq+strideq*0], 1 1287 punpcklqdq ym3, ym1, ym2 1288 psrldq ym1, 2 1289 psrldq ym2, 2 1290 pmullw ym3, ym4 1291 punpcklqdq ym1, ym2 1292 pmullw ym1, ym5 1293 psubw ym3, ym6 1294 paddw ym1, ym3 1295 psraw ym1, 2 ; 1 2 3 4 1296 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 1297 mova ym0, ym1 1298 psubw ym1, ym2 1299 pmulhrsw ym1, ym7 1300 paddw ym1, ym2 1301 mova [tmpq], ym1 1302 add tmpq, 32 1303 sub hd, 4 1304 jg .hv_w4_loop 1305 RET 1306.hv_w8: 1307 pmullw xm0, xm4, [srcq+strideq*0+0] 1308 pmullw xm1, xm5, [srcq+strideq*0+2] 1309 psubw xm0, xm6 1310 paddw xm0, xm1 1311 psraw xm0, 2 1312 vinserti32x4 m0, xm0, 3 1313.hv_w8_loop: 1314 movu xm1, [srcq+strideq*1+0] 1315 movu xm2, [srcq+strideq*1+2] 1316 vinserti32x4 ym1, [srcq+strideq*2+0], 1 1317 vinserti32x4 ym2, [srcq+strideq*2+2], 1 1318 vinserti32x4 m1, [srcq+stride3q +0], 2 1319 vinserti32x4 m2, [srcq+stride3q +2], 2 1320 lea srcq, [srcq+strideq*4] 1321 vinserti32x4 m1, [srcq+strideq*0+0], 3 1322 
vinserti32x4 m2, [srcq+strideq*0+2], 3 1323 pmullw m1, m4 1324 pmullw m2, m5 1325 psubw m1, m6 1326 paddw m1, m2 1327 psraw m1, 2 ; 1 2 3 4 1328 valignq m2, m1, m0, 6 ; 0 1 2 3 1329 mova m0, m1 1330 psubw m1, m2 1331 pmulhrsw m1, m7 1332 paddw m1, m2 1333 mova [tmpq], m1 1334 add tmpq, 64 1335 sub hd, 4 1336 jg .hv_w8_loop 1337 RET 1338.hv_w16: 1339 pmullw ym0, ym4, [srcq+strideq*0+0] 1340 pmullw ym1, ym5, [srcq+strideq*0+2] 1341 psubw ym0, ym6 1342 paddw ym0, ym1 1343 psraw ym0, 2 1344 vinserti32x8 m0, ym0, 1 1345.hv_w16_loop: 1346 movu ym1, [srcq+strideq*1+0] 1347 movu ym2, [srcq+strideq*1+2] 1348 lea srcq, [srcq+strideq*2] 1349 vinserti32x8 m1, [srcq+strideq*0+0], 1 1350 vinserti32x8 m2, [srcq+strideq*0+2], 1 1351 pmullw m1, m4 1352 pmullw m2, m5 1353 psubw m1, m6 1354 paddw m1, m2 1355 psraw m1, 2 ; 1 2 1356 vshufi32x4 m2, m0, m1, q1032 ; 0 1 1357 mova m0, m1 1358 psubw m1, m2 1359 pmulhrsw m1, m7 1360 paddw m1, m2 1361 mova [tmpq], m1 1362 add tmpq, 64 1363 sub hd, 2 1364 jg .hv_w16_loop 1365 RET 1366.hv_w32: 1367 pmullw m0, m4, [srcq+strideq*0+0] 1368 pmullw m1, m5, [srcq+strideq*0+2] 1369 psubw m0, m6 1370 paddw m0, m1 1371 psraw m0, 2 1372.hv_w32_loop: 1373 pmullw m3, m4, [srcq+strideq*1+0] 1374 pmullw m1, m5, [srcq+strideq*1+2] 1375 lea srcq, [srcq+strideq*2] 1376 psubw m3, m6 1377 paddw m3, m1 1378 psraw m3, 2 1379 psubw m1, m3, m0 1380 pmulhrsw m1, m7 1381 paddw m1, m0 1382 pmullw m0, m4, [srcq+strideq*0+0] 1383 pmullw m2, m5, [srcq+strideq*0+2] 1384 psubw m0, m6 1385 paddw m0, m2 1386 psraw m0, 2 1387 psubw m2, m0, m3 1388 pmulhrsw m2, m7 1389 paddw m2, m3 1390 mova [tmpq+64*0], m1 1391 mova [tmpq+64*1], m2 1392 add tmpq, 64*2 1393 sub hd, 2 1394 jg .hv_w32_loop 1395 RET 1396.hv_w64: 1397 pmullw m0, m4, [srcq+ 0] 1398 pmullw m2, m5, [srcq+ 2] 1399 pmullw m1, m4, [srcq+64] 1400 pmullw m3, m5, [srcq+66] 1401 psubw m0, m6 1402 psubw m1, m6 1403 paddw m0, m2 1404 paddw m1, m3 1405 psraw m0, 2 1406 psraw m1, 2 1407.hv_w64_loop: 1408 add srcq, strideq 1409 
pmullw m2, m4, [srcq+ 0] 1410 pmullw m8, m5, [srcq+ 2] 1411 pmullw m3, m4, [srcq+64] 1412 pmullw m9, m5, [srcq+66] 1413 psubw m2, m6 1414 psubw m3, m6 1415 paddw m2, m8 1416 paddw m3, m9 1417 psraw m2, 2 1418 psraw m3, 2 1419 psubw m8, m2, m0 1420 psubw m9, m3, m1 1421 pmulhrsw m8, m7 1422 pmulhrsw m9, m7 1423 paddw m8, m0 1424 mova m0, m2 1425 paddw m9, m1 1426 mova m1, m3 1427 mova [tmpq+64*0], m8 1428 mova [tmpq+64*1], m9 1429 add tmpq, 64*2 1430 dec hd 1431 jg .hv_w64_loop 1432 RET 1433.hv_w128: 1434 pmullw m0, m4, [srcq+ 0] 1435 pmullw m8, m5, [srcq+ 2] 1436 pmullw m1, m4, [srcq+ 64] 1437 pmullw m9, m5, [srcq+ 66] 1438 pmullw m2, m4, [srcq+128] 1439 pmullw m10, m5, [srcq+130] 1440 pmullw m3, m4, [srcq+192] 1441 pmullw m11, m5, [srcq+194] 1442 REPX {psubw x, m6}, m0, m1, m2, m3 1443 paddw m0, m8 1444 paddw m1, m9 1445 paddw m2, m10 1446 paddw m3, m11 1447 REPX {psraw x, 2}, m0, m1, m2, m3 1448.hv_w128_loop: 1449 add srcq, strideq 1450 pmullw m8, m4, [srcq+ 0] 1451 pmullw m12, m5, [srcq+ 2] 1452 pmullw m9, m4, [srcq+ 64] 1453 pmullw m13, m5, [srcq+ 66] 1454 pmullw m10, m4, [srcq+128] 1455 pmullw m14, m5, [srcq+130] 1456 pmullw m11, m4, [srcq+192] 1457 pmullw m15, m5, [srcq+194] 1458 REPX {psubw x, m6}, m8, m9, m10, m11 1459 paddw m8, m12 1460 paddw m9, m13 1461 paddw m10, m14 1462 paddw m11, m15 1463 REPX {psraw x, 2}, m8, m9, m10, m11 1464 psubw m12, m8, m0 1465 psubw m13, m9, m1 1466 psubw m14, m10, m2 1467 psubw m15, m11, m3 1468 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 1469 paddw m12, m0 1470 mova m0, m8 1471 paddw m13, m1 1472 mova m1, m9 1473 mova [tmpq+64*0], m12 1474 mova [tmpq+64*1], m13 1475 paddw m14, m2 1476 mova m2, m10 1477 paddw m15, m3 1478 mova m3, m11 1479 mova [tmpq+64*2], m14 1480 mova [tmpq+64*3], m15 1481 add tmpq, 64*4 1482 dec hd 1483 jg .hv_w128_loop 1484 RET 1485 1486; int8_t subpel_filters[5][15][8] 1487%assign FILTER_REGULAR (0*15 << 16) | 3*15 1488%assign FILTER_SMOOTH (1*15 << 16) | 4*15 1489%assign FILTER_SHARP (2*15 << 16) | 3*15 
; FN: emit a filter-type-specific entry point for an 8tap-family function.
; It packs the horizontal/vertical filter-table offsets (FILTER_*) into
; t0d/t1d and, unless this is the last variant, tail-jumps into the shared
; implementation named by %5.
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; Scratch registers / 16-byte spill area for filter coefficients.
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif

; Filter combinations whose longest tap count is 6 dispatch into
; put_6tap_16bpc; the sharp (8-tap) combinations are declared later.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

; put_6tap_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my)
; 6-tap subpel filtering for 16bpc pixels. mx/my select the subpel phase;
; t0d/t1d (set by the FN stubs above) bias them so that the filter table
; row and the h/v tap count are encoded together. Dispatches to .put
; (no filtering), .h, .v or .hv via per-width jump tables.
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movifnidn wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.put: ; no subpel offset: plain copy, dispatched by log2(width)
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
%if WIN64
    pop r8
%endif
    jmp wq
.h_w8: ; horizontal-only, w==8: vpermb gathers the three tap pairs per row
    mova m4, [spel_h_shufA]
    movu m5, [spel_h_shufB]
    movu m6, [spel_h_shufC]
.h_w8_loop:
    movu ym2, [srcq+ssq*0]
    vinserti32x8 m2, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    vpermb m1, m4, m2
    vpdpwssd m0, m10, m1
    vpermb m1, m5, m2
    vpdpwssd m0, m11, m1
    vpermb m1, m6, m2
    vpdpwssd m0, m12, m1
    psrad m0, 6
    vextracti32x8 ym1, m0, 1
    packusdw ym0, ym1
    pminsw ym0, ym15 ; clamp to pixel max (broadcast from r8m)
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8_loop
    RET
.h: ; horizontal filtering setup
    vpbroadcastw m15, r8m ; NOTE(review): r8m presumably holds bitdepth/pixel max — verify against caller
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    shr r7d, 11 ; index rounding constant by bitdepth bit
    vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
    cmp wd, 4
    ; w <= 4 uses at most 4 taps, shared with the 8tap implementation
    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
    shr mxd, 16
    sub srcq, 4
    ; load the 6-tap coefficients (3 dword pairs) into m10..m12
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    mova [buf], xmm0
    vpbroadcastd m10, xmm0
    vpbroadcastd m12, [buf+8]
    vpbroadcastd m11, [buf+4]
    sub wd, 16
    jl .h_w8
    vbroadcasti32x4 m6, [spel_h_shufA]
    vbroadcasti32x4 m7, [spel_h_shufB]
    jg .h_w32
.h_w16_loop: ; two rows per iteration
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+12]
    vinserti32x8 m3, [srcq+ssq*1+12], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0 b0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; a2' b2'
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1 b1
    vpdpwssd m1, m11, m3 ; a1' b1'
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2 b2
    vpdpwssd m1, m10, m2 ; a0' b0'
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16_loop
    RET
.h_w32: ; w >= 32: iterate 32 pixels at a time across the row (r6 = -w..0)
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+12]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; b2
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1
    vpdpwssd m1, m11, m3 ; b1
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2
    vpdpwssd m1, m10, m2 ; b0
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+r6*2], m0
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w32_loop0
    RET
.v: ; vertical-only filtering; r6 = -stride so [srcq+r6*2] is two rows up
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd ; h < 6 uses the 4-tap variant of the filter
    vpbroadcastd m11, [pd_32]
    pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
    tzcnt r7d, wd
    vpbroadcastw m15, r8m
    mov r6, ssq
    movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
    neg r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd m12, xmm0
    add r7, r8
    vpbroadcastd m13, [rsp+stack_offset+12]
    vpbroadcastd m14, [rsp+stack_offset+16]
    jmp r7
.v_w2: ; rows are interleaved into word pairs (01 12 / 23 34 / 45 56)
    movd xmm2, [srcq+r6 *2]
    pinsrd xmm2, [srcq+r6 *1], 1
    pinsrd xmm2, [srcq+ssq*0], 2
    pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklwd xmm1, xmm2, xmm3 ; 01 12
    punpckhwd xmm2, xmm3 ; 23 34
.v_w2_loop:
    movd xmm3, [srcq+ssq*1]
    mova xmm4, xm11
    vpdpwssd xmm4, xmm1, xm12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova xmm1, xmm2
    vpdpwssd xmm4, xmm2, xm13 ; a1 b1
    punpckldq xmm2, xmm0, xmm3 ; 4 5
    movd xmm0, [srcq+ssq*0]
    punpckldq xmm3, xmm0 ; 5 6
    punpcklwd xmm2, xmm3 ; 45 56
    vpdpwssd xmm4, xmm2, xm14 ; a2 b2
    psrad xmm4, 6
    packusdw xmm4, xmm4
    pminsw xmm4, xm15
    movd [dstq+dsq*0], xmm4
    pextrd [dstq+dsq*1], xmm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm1, [srcq+r6 *2]
    vpbroadcastq ymm3, [srcq+r6 *1]
    vpbroadcastq ymm2, [srcq+ssq*0]
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm3, 0x30
    vpblendd ymm3, ymm2, 0x30
    punpcklwd ymm1, ymm3 ; 01 12
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm0, 0x30
    punpcklwd ymm2, ymm4 ; 23 34
.v_w4_loop:
    vpbroadcastq ymm3, [srcq+ssq*1]
    mova ymm4, ym11
    vpdpwssd ymm4, ymm1, ym12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova ymm1, ymm2
    vpdpwssd ymm4, ymm2, ym13 ; a1 b1
    vpblendd ymm2, ymm0, ymm3, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm3, ymm0, 0x30
    punpcklwd ymm2, ymm3 ; 45 56
    vpdpwssd ymm4, ymm2, ym14 ; a2 b2
    psrad ymm4, 6
    vextracti128 xmm3, ymm4, 1
    packusdw xmm4, xmm3
    pminsw xmm4, xm15
    movq [dstq+dsq*0], xmm4
    movhps [dstq+dsq*1], xmm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    vzeroupper ; legacy-ymm code above; avoid AVX/SSE transition penalties
    RET
.v_w8:
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vinserti32x4 m1, m0, [srcq+r6 *2], 0
    vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2
    vinserti32x4 ym0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m5, [spel_v_shuf8]
    vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4
    vpermb m1, m5, m1 ; 01 12
    vpermb m2, m5, m0 ; 23 34
.v_w8_loop:
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    movu xm3, [srcq+ssq*0]
    mova m4, m11
    vpdpwssd m4, m12, m1 ; a0 b0
    vshufi32x4 m0, m3, q1032 ; 4 5 6
    mova m1, m2
    vpdpwssd m4, m13, m2 ; a1 b1
    vpermb m2, m5, m0 ; 45 56
    vpdpwssd m4, m14, m2 ; a2 b2
    psrad m4, 6
    vextracti32x8 ym3, m4, 1
    packusdw ym4, ym3
    pminsw ym4, ym15
    mova [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+r6 *1]
    vinserti32x8 m1, m0, [srcq+ssq*0], 1
    vinserti32x8 m0, [srcq+r6*2], 0
    mova m6, [spel_v_shuf16]
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m1, m6, m1 ; 12
    vpermb m0, m6, m0 ; 01
    vpermb m3, m6, m3 ; 34
    mova m7, [deint_q_shuf]
    vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m3, m6, m3 ; 56
    vpshrdd m2, m1, m3, 16 ; 45
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m4, m14, m2 ; a2
    psrad m5, 6
    psrad m4, 6
    packusdw m4, m5
    pminsw m4, m15
    vpermq m4, m7, m4 ; de-interleave rows before storing
    mova [dstq+dsq*0], ym4
    vextracti32x8 [dstq+dsq*1], m4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32: ; w>=32 share one path: 32-pixel columns, outer loop over columns
.v_w64:
.v_w128:
    lea wd, [hq+wq*8-256] ; pack h into wb, remaining column count into upper bits
.v_w32_loop0:
    movu m16, [srcq+r6 *2]
    movu m17, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    movu m18, [srcq+ssq*0]
    movu m19, [srcq+ssq*1]
    mov r8, dstq
    movu m20, [r7 +ssq*0]
    punpcklwd m0, m16, m17 ; 01
    punpckhwd m16, m17
    punpcklwd m1, m17, m18 ; 12
    punpckhwd m17, m18
    punpcklwd m2, m18, m19 ; 23
    punpckhwd m18, m19
    punpcklwd m3, m19, m20 ; 34
    punpckhwd m19, m20
.v_w32_loop:
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m6, m11
    vpdpwssd m6, m12, m16
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m7, m11
    vpdpwssd m7, m12, m17
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    mova m16, m18
    vpdpwssd m6, m13, m18
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m17, m19
    vpdpwssd m7, m13, m19
    movu m19, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    punpcklwd m2, m20, m19 ; 45
    punpckhwd m18, m20, m19
    movu m20, [r7+ssq*0]
    vpdpwssd m4, m14, m2 ; a2
    vpdpwssd m6, m14, m18
    punpcklwd m3, m19, m20 ; 56
    punpckhwd m19, m20
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m7, m14, m19
    REPX {psrad x, 6}, m4, m6, m5, m7
    packusdw m4, m6
    packusdw m5, m7
    pminsw m4, m15
    pminsw m5, m15
    mova [r8+dsq*0], m4
    mova [r8+dsq*1], m5
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64 ; next 32-pixel column
    add dstq, 64
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w32_loop0
    vzeroupper
    RET
.hv: ; separable h+v filtering; intermediates kept at higher precision (psrad 10 at the end)
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb ; w <= 4 uses the 4-tap horizontal filter
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    neg r6
    test dword r8m, 0x800
    jnz .hv_12bit
    vpbroadcastd m10, [pd_2176]
    psllw xmm0, 6
    jmp .hv_main
.hv_12bit: ; 12bpc: smaller coefficient scaling to avoid overflow
    vpbroadcastd m10, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_main:
    movu xm4, [srcq+r6 *2]
    vinserti32x4 ym4, [srcq+r6 *1], 1
    vinserti32x4 m4, [srcq+ssq*0], 2
    vbroadcasti32x4 m6, [spel_h_shufA]
    vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movu xm5, [srcq+ssq*0] ; 4
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m8, [buf+ 4]
    vpbroadcastd m9, [buf+ 8]
    vpbroadcastd ym12, xmm1
    vpbroadcastd ym13, [buf+20]
    vpbroadcastd ym14, [buf+24]
    cmp wd, 4
    je .hv_w4
    ; w == 2 falls through
    vbroadcasti32x4 m2, [spel_h_shufA]
    mova m3, [spel_h_shuf2b]
    mova m1, m10
    pshufb m4, m6
    pshufb xm5, xm6
    punpcklqdq m2, m4, m5
    vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_
    mova ym6, [spel_h_shuf2a]
    punpckhqdq m4, m5
    mova xm5, [spel_shuf2]
    vpdpwssd m1, m9, m4
    vpermb m1, m3, m1 ; 01 12
    vextracti32x4 xm2, ym1, 1 ; 23 34
.hv_w2_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    vpermb ym3, ym6, ym3
    pmaddwd xmm0, xm12, xm1 ; a0 b0
    mova xm4, xm10
    vpdpwssd xm4, xm8, xm3
    vextracti32x4 xm3, ym3, 1
    mova xm1, xm2
    vpdpwssd xmm0, xm13, xm2 ; a1 b1
    vpdpwssd xm4, xm9, xm3 ; 5 6
    vpermt2b xm2, xm5, xm4 ; 45 56
    vpdpwssd xmm0, xm14, xm2 ; a2 b2
    psrad xmm0, 10
    packusdw xmm0, xmm0
    pminsw xmm0, xm15
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti32x4 m7, [spel_h_shufB]
    mova ym0, [spel_shuf4a]
    pshufb m1, m4, m6
    mova m2, m10
    vpdpwssd m2, m8, m1
    pshufb xm1, xm5, xm6
    mova xm3, xm10
    vpdpwssd xm3, xm8, xm1
    pshufb m4, m7
    pshufb xm5, xm7
    vpdpwssd m2, m9, m4 ; 0 1 2 3
    vpdpwssd xm3, xm9, xm5 ; 4
    mova ym5, [spel_shuf4b]
    vpermb m1, m0, m2 ; 01 12
    vshufi32x4 m2, m3, q1032 ; 2 3 4
    vpermb m2, m0, m2 ; 23 34
.hv_w4_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    pmaddwd ym0, ym12, ym1 ; a0 b0
    mova ym1, ym2
    pshufb ym4, ym3, ym6
    mova ym2, ym10
    vpdpwssd ym2, ym8, ym4
    pshufb ym3, ym7
    vpdpwssd ym0, ym13, ym1 ; a1 b1
    vpdpwssd ym2, ym9, ym3 ; 5 6
    vpermt2b ym2, ym5, ym1 ; 45 56
    vpdpwssd ym0, ym14, ym2 ; a2 b2
    psrad ym0, 10
    vextracti32x4 xm4, ym0, 1
    packusdw xm0, xm4
    pminsw xmm0, xm0, xm15
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; w >= 8 uses the full 6-tap horizontal filter
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 4
    neg r6
    test dword r8m, 0x800
    jnz .hv_w8_12bit
    vpbroadcastd m8, [pd_2176]
    psllw xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd m8, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_w8_main:
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m9, xmm0
    vpbroadcastd m10, [buf+ 4]
    vpbroadcastd m11, [buf+ 8]
    vpbroadcastd m12, xmm1
    vpbroadcastd m13, [buf+20]
    vpbroadcastd m14, [buf+24]
    cmp wd, 16
    jge .hv_w16
    ; w == 8: prime the vertical pipeline with 5 horizontally-filtered rows
    mova m6, [spel_h_shufA]
    movu ym16, [srcq+r6 *2]
    vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1
    movu ym17, [srcq+ssq*0]
    vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3
    lea srcq, [srcq+ssq*2]
    movu ym18, [srcq+ssq*0] ; 4
    movu m7, [spel_h_shufC]
    vpermb m3, m6, m16
    mova m1, m8
    vpermb m4, m6, m17
    vpdpwssd m1, m9, m3 ; a0 b0
    mova m2, m8
    vpermb m5, m6, m18
    vpdpwssd m2, m9, m4 ; c0 d0
    mova m0, m8
    vpermb m16, m7, m16
    vpdpwssd m0, m9, m5 ; e0
    vpermb m17, m7, m17
    vpdpwssd m1, m11, m16 ; a2 b2
    vpermb m18, m7, m18
    vpdpwssd m2, m11, m17 ; c2 d2
    shufpd m3, m16, 0x55
    vpdpwssd m0, m11, m18 ; e2
    mova m16, [spel_shuf8a]
    shufpd m4, m17, 0x55
    vpdpwssd m1, m10, m3 ; a1 b1
    shufpd m5, m18, 0x55
    vpdpwssd m2, m10, m4 ; c1 d1
    vpdpwssd m0, m10, m5 ; e1
    mova m5, [spel_shuf8b]
    vpermt2b m1, m16, m2 ; 01 12
    vpermt2b m2, m16, m0 ; 23 34
.hv_w8_loop:
    movu ym18, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0], 1
    mova m0, m8
    vpermb m17, m6, m18
    vpdpwssd m0, m9, m17 ; f0 g0
    vpermb m18, m7, m18
    pmaddwd m16, m12, m1 ; A0 B0
    vpdpwssd m0, m11, m18 ; f2 g2
    shufpd m17, m18, 0x55
    mova m1, m2
    vpdpwssd m16, m13, m2 ; A1 B1
    vpdpwssd m0, m10, m17 ; f1 g1
    vpermt2b m2, m5, m0 ; 45 56
    vpdpwssd m16, m14, m2 ; A2 B2
    psrad m16, 10
    vextracti32x8 ym17, m16, 1
    packusdw ym16, ym17
    pminsw ym16, ym15
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    vbroadcasti32x4 m20, [spel_h_shufA]
    vbroadcasti32x4 m21, [spel_h_shufB]
    jg .hv_w32 ; flags still set from "cmp wd, 16"
    vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
    vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
    movu ym16, [srcq+r6 *1+ 0]
    movu ym17, [srcq+r6 *1+12]
    vinserti32x8 m16, [srcq+ssq*0+ 0], 1
    vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4
    pshufb m2, m20
    mova m1, m8
    pshufb m3, m16, m20
    vpdpwssd m1, m11, m2 ; a2
    mova m2, m8
    pshufb m4, m17, m21
    vpdpwssd m2, m9, m3 ; b0 c0
    mova m3, m8
    pshufb m5, m18, m20
    vpdpwssd m3, m11, m4 ; b2' c2'
    mova m4, m8
    pshufb m7, m19, m21
    vpdpwssd m4, m9, m5 ; d0 e0
    mova m5, m8
    pshufb m0, m6, m20
    vpdpwssd m5, m11, m7 ; d2' e2'
    mova m7, [spel_shuf16]
    pshufb m16, m21
    vpdpwssd m1, m9, m0 ; a0
    pshufb m17, m20
    vpdpwssd m2, m10, m16 ; b1 c1
    pshufb m18, m21
    vpdpwssd m3, m10, m17 ; b1' c1'
    pshufb m19, m20
    vpdpwssd m4, m10, m18 ; d1 e1
    pshufb m6, m21
    vpdpwssd m5, m10, m19 ; d1' e1'
    shufpd m16, m17, 0x55
    vpdpwssd m1, m10, m6 ; a1
    shufpd m18, m19, 0x55
    vpdpwssd m2, m11, m16 ; b2 c2
    vpdpwssd m3, m9, m16 ; b0' c0'
    vpdpwssd m4, m11, m18 ; d2 e2
    vpdpwssd m5, m9, m18 ; d0' e0'
    pslldq m1, 1
    vpermt2b m2, m7, m3 ; 12
    vpermt2b m4, m7, m5 ; 34
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1
    mova m5, m8
    mova m6, m8
    pshufb m17, m18, m20
    vpdpwssd m5, m9, m17 ; f0 g0
    pshufb m16, m19, m21
    vpdpwssd m6, m11, m16 ; f2' g2'
    pmaddwd m17, m12, m2 ; B0
    mova m2, m4
    pmaddwd m16, m12, m1 ; A0
    mova m1, m3
    pshufb m18, m21
    vpdpwssd m5, m10, m18 ; f1 g1
    pshufb m19, m20
    vpdpwssd m6, m10, m19 ; f1' g1'
    vpdpwssd m17, m13, m4 ; B1
    vpdpwssd m16, m13, m3 ; A1
    shufpd m18, m19, 0x55
    vpdpwssd m5, m11, m18 ; f2 g2
    vpdpwssd m6, m9, m18 ; f0' g0'
    mova m4, m7
    vpermi2b m4, m5, m6 ; 56
    vpshrdd m3, m2, m4, 16 ; 45
    vpdpwssd m17, m14, m4 ; B2
    vpdpwssd m16, m14, m3 ; A2
    psrad m16, 10
    psrad m17, 10
    vshufi32x4 m18, m16, m17, q3232
    vinserti32x8 m16, ym17, 1
    packusdw m16, m18
    pminsw m16, m15
    mova [dstq+dsq*0], ym16
    vextracti32x8 [dstq+dsq*1], m16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
.hv_w32: ; w >= 32: 32-pixel columns, primed/unprimed halves tracked as x / x'
    WIN64_SPILL_XMM 28
    mova m27, [spel_shuf32]
    lea wd, [hq+wq*8-256]
.hv_w32_loop0:
    movu m16, [srcq+r6 *2+ 0]
    movu m7, [srcq+r6 *2+12]
    movu m6, [srcq+r6 *1+ 0]
    movu m18, [srcq+r6 *1+12]
    lea r7, [srcq+ssq*2]
    movu m17, [srcq+ssq*0+ 0]
    movu m19, [srcq+ssq*0+12]
    movu m22, [srcq+ssq*1+ 0]
    movu m24, [srcq+ssq*1+12]
    mov r8, dstq
    movu m23, [r7 +ssq*0+ 0]
    movu m25, [r7 +ssq*0+12]
    pshufb m1, m16, m20
    mova m0, m8
    pshufb m2, m7, m21
    vpdpwssd m0, m9, m1 ; a0
    mova m1, m8
    pshufb m4, m6, m20
    vpdpwssd m1, m11, m2 ; a2'
    mova m2, m8
    pshufb m3, m17, m20
    vpdpwssd m2, m9, m4 ; b0
    mova m4, m8
    pshufb m5, m18, m21
    vpdpwssd m4, m9, m3 ; c0
    mova m3, m8
    pshufb m26, m19, m21
    vpdpwssd m3, m11, m5 ; b2'
    mova m5, m8
    pshufb m16, m21
    vpdpwssd m5, m11, m26 ; c2'
    pshufb m7, m20
    vpdpwssd m0, m10, m16 ; a1
    pshufb m6, m21
    vpdpwssd m1, m10, m7 ; a1'
    pshufb m17, m21
    vpdpwssd m2, m10, m6 ; b1
    pshufb m18, m20
    vpdpwssd m4, m10, m17 ; c1
    pshufb m19, m20
    vpdpwssd m3, m10, m18 ; b1'
    shufpd m16, m7, 0x55
    vpdpwssd m5, m10, m19 ; c1'
    shufpd m6, m18, 0x55
    vpdpwssd m0, m11, m16 ; a2
    shufpd m17, m19, 0x55
    vpdpwssd m1, m9, m16 ; a0'
    pshufb m16, m22, m20
    vpdpwssd m2, m11, m6 ; b2
    pshufb m7, m23, m20
    vpdpwssd m4, m11, m17 ; c2
    vpdpwssd m3, m9, m6 ; b0'
    mova m6, m8
    vpdpwssd m5, m9, m17 ; c0'
    pshufb m17, m24, m21
    vpdpwssd m6, m9, m16 ; d0
    mova m16, m8
    pshufb m26, m25, m21
    vpdpwssd m16, m9, m7 ; e0
    mova m7, m8
    pshufb m22, m21
    vpdpwssd m7, m11, m17 ; d2'
    mova m17, m8
    pshufb m23, m21
    vpdpwssd m17, m11, m26 ; e2'
    pshufb m24, m20
    vpdpwssd m6, m10, m22 ; d1
    pshufb m25, m20
    vpdpwssd m16, m10, m23 ; e1
    shufpd m22, m24, 0x55
    vpdpwssd m7, m10, m24 ; d1'
    shufpd m23, m25, 0x55
    vpdpwssd m17, m10, m25 ; e1'
    pslldq m0, 1
    vpdpwssd m6, m11, m22 ; d2
    pslldq m1, 1
    vpdpwssd m16, m11, m23 ; e2
    vpermt2b m2, m27, m4 ; 12
    vpdpwssd m7, m9, m22 ; d0'
    vpermt2b m3, m27, m5 ; 12'
    vpdpwssd m17, m9, m23 ; e0'
    vpshrdd m0, m2, 16 ; 01
    vpermt2b m6, m27, m16 ; 34
    vpshrdd m1, m3, 16 ; 01'
    vpermt2b m7, m27, m17 ; 34'
    vpshrdd m4, m2, m6, 16 ; 23
    vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    movu m22, [r7+ssq*1+ 0]
    movu m24, [r7+ssq*1+12]
    lea r7, [r7+ssq*2]
    movu m23, [r7+ssq*0+ 0]
    movu m25, [r7+ssq*0+12]
    pmaddwd m17, m12, m2 ; B0
    mova m2, m6
    pmaddwd m19, m12, m3 ; B0'
    mova m3, m7
    pmaddwd m16, m12, m0 ; A0
    mova m0, m4
    pmaddwd m18, m12, m1 ; A0'
    mova m1, m5
    vpdpwssd m17, m13, m6 ; B1
    vpdpwssd m19, m13, m7 ; B1'
    mova m6, m8
    vpdpwssd m16, m13, m4 ; A1
    pshufb m4, m22, m20
    vpdpwssd m18, m13, m5 ; A1'
    pshufb m7, m23, m20
    vpdpwssd m6, m9, m4 ; f0
    mova m4, m8
    pshufb m5, m24, m21
    vpdpwssd m4, m9, m7 ; g0
    mova m7, m8
    pshufb m26, m25, m21
    vpdpwssd m7, m11, m5 ; f2'
    mova m5, m8
    pshufb m22, m21
    vpdpwssd m5, m11, m26 ; g2'
    pshufb m23, m21
    vpdpwssd m6, m10, m22 ; f1
    pshufb m24, m20
    vpdpwssd m4, m10, m23 ; g1
    pshufb m25, m20
    vpdpwssd m7, m10, m24 ; f1'
    shufpd m22, m24, 0x55
    vpdpwssd m5, m10, m25 ; g1'
    shufpd m23, m25, 0x55
    vpdpwssd m6, m11, m22 ; f2
    vpdpwssd m4, m11, m23 ; g2
    vpdpwssd m7, m9, m22 ; f0'
    vpdpwssd m5, m9, m23 ; g0'
    vpermt2b m6, m27, m4 ; 56
    vpermt2b m7, m27, m5 ; 56'
    vpdpwssd m17, m14, m6 ; B2
    vpshrdd m4, m2, m6, 16 ; 45
    vpdpwssd m19, m14, m7 ; B2'
vpshrdd m5, m3, m7, 16 ; 45' 2316 vpdpwssd m16, m14, m4 ; A2 2317 vpdpwssd m18, m14, m5 ; A2' 2318 REPX {psrad x, 10}, m17, m19, m16, m18 2319 packusdw m17, m19 2320 packusdw m16, m18 2321 pminsw m17, m15 2322 pminsw m16, m15 2323 mova [r8+dsq*0], m16 2324 mova [r8+dsq*1], m17 2325 lea r8, [r8+dsq*2] 2326 sub hd, 2 2327 jg .hv_w32_loop 2328 add srcq, 64 2329 add dstq, 64 2330 movzx hd, wb 2331 sub wd, 1<<8 2332 jg .hv_w32_loop0 2333 RET 2334 2335PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 2336PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc 2337PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc 2338PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc 2339PUT_8TAP_FN sharp, SHARP, SHARP 2340 2341cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my 2342 imul mxd, mxm, 0x010101 2343 add mxd, t0d ; 8tap_h, mx, 4tap_h 2344 imul myd, mym, 0x010101 2345 add myd, t1d ; 8tap_v, my, 4tap_v 2346 lea r8, [put_avx512icl] 2347 movifnidn wd, wm 2348 movifnidn hd, hm 2349 test mxd, 0xf00 2350 jnz .h 2351 test myd, 0xf00 2352 jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put 2353.v: 2354 movzx mxd, myb 2355 shr myd, 16 2356 cmp hd, 6 2357 cmovs myd, mxd 2358 vpbroadcastd m10, [pd_32] 2359 pmovsxbw xmm0, [base+subpel_filters+myq*8] 2360 tzcnt r7d, wd 2361 vpbroadcastw m11, r8m 2362 lea r6, [ssq*3] 2363 movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] 2364 sub srcq, r6 2365 mova [rsp+stack_offset+8], xmm0 2366 vpbroadcastd m12, xmm0 2367 add r7, r8 2368 vpbroadcastd m13, [rsp+stack_offset+12] 2369 vpbroadcastd m14, [rsp+stack_offset+16] 2370 vpbroadcastd m15, [rsp+stack_offset+20] 2371 jmp r7 2372.v_w2: 2373 movd xmm2, [srcq+ssq*0] 2374 pinsrd xmm2, [srcq+ssq*1], 1 2375 pinsrd xmm2, [srcq+ssq*2], 2 2376 add srcq, r6 2377 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 2378 movd xmm3, [srcq+ssq*1] 2379 vpbroadcastd xmm1, [srcq+ssq*2] 2380 add srcq, r6 2381 vpbroadcastd xmm0, [srcq+ssq*0] 2382 vpblendd xmm3, xmm1, 0x02 ; 4 5 2383 vpblendd 
xmm1, xmm0, 0x02 ; 5 6 2384 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 2385 punpcklwd xmm3, xmm1 ; 45 56 2386 punpcklwd xmm1, xmm2, xmm4 ; 01 12 2387 punpckhwd xmm2, xmm4 ; 23 34 2388.v_w2_loop: 2389 vpbroadcastd xmm4, [srcq+ssq*1] 2390 lea srcq, [srcq+ssq*2] 2391 mova xmm5, xm10 2392 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 2393 mova xmm1, xmm2 2394 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 2395 mova xmm2, xmm3 2396 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 2397 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 2398 vpbroadcastd xmm0, [srcq+ssq*0] 2399 vpblendd xmm4, xmm0, 0x02 ; 7 8 2400 punpcklwd xmm3, xmm4 ; 67 78 2401 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 2402 psrad xmm5, 6 2403 packusdw xmm5, xmm5 2404 pminsw xmm5, xm11 2405 movd [dstq+dsq*0], xmm5 2406 pextrd [dstq+dsq*1], xmm5, 1 2407 lea dstq, [dstq+dsq*2] 2408 sub hd, 2 2409 jg .v_w2_loop 2410 RET 2411.v_w4: 2412 movq xmm1, [srcq+ssq*0] 2413 vpbroadcastq ymm0, [srcq+ssq*1] 2414 vpbroadcastq ymm2, [srcq+ssq*2] 2415 add srcq, r6 2416 vpbroadcastq ymm4, [srcq+ssq*0] 2417 vpbroadcastq ymm3, [srcq+ssq*1] 2418 vpbroadcastq ymm5, [srcq+ssq*2] 2419 add srcq, r6 2420 vpblendd ymm1, ymm0, 0x30 2421 vpblendd ymm0, ymm2, 0x30 2422 punpcklwd ymm1, ymm0 ; 01 12 2423 vpbroadcastq ymm0, [srcq+ssq*0] 2424 vpblendd ymm2, ymm4, 0x30 2425 vpblendd ymm4, ymm3, 0x30 2426 punpcklwd ymm2, ymm4 ; 23 34 2427 vpblendd ymm3, ymm5, 0x30 2428 vpblendd ymm5, ymm0, 0x30 2429 punpcklwd ymm3, ymm5 ; 45 56 2430.v_w4_loop: 2431 vpbroadcastq ymm5, [srcq+ssq*1] 2432 lea srcq, [srcq+ssq*2] 2433 mova ymm4, ym10 2434 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 2435 mova ymm1, ymm2 2436 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 2437 mova ymm2, ymm3 2438 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 2439 vpblendd ymm3, ymm0, ymm5, 0x30 2440 vpbroadcastq ymm0, [srcq+ssq*0] 2441 vpblendd ymm5, ymm0, 0x30 2442 punpcklwd ymm3, ymm5 ; 67 78 2443 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 2444 psrad ymm4, 6 2445 vextracti128 xmm5, ymm4, 1 2446 packusdw xmm4, xmm5 2447 pminsw xmm4, xm11 2448 movq [dstq+dsq*0], xmm4 2449 movhps 
[dstq+dsq*1], xmm4 2450 lea dstq, [dstq+dsq*2] 2451 sub hd, 2 2452 jg .v_w4_loop 2453 vzeroupper 2454 RET 2455.v_w8: 2456 vbroadcasti32x4 m2, [srcq+ssq*2] 2457 vinserti32x4 m1, m2, [srcq+ssq*0], 0 2458 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 2459 add srcq, r6 2460 vinserti32x4 ym2, [srcq+ssq*0], 1 2461 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 2462 mova m6, [spel_v_shuf8] 2463 movu xm0, [srcq+ssq*1] 2464 vinserti32x4 ym0, [srcq+ssq*2], 1 2465 add srcq, r6 2466 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2467 vpermb m1, m6, m1 ; 01 12 2468 vpermb m2, m6, m2 ; 23 34 2469 vpermb m3, m6, m0 ; 45 56 2470.v_w8_loop: 2471 vinserti32x4 m0, [srcq+ssq*1], 3 2472 lea srcq, [srcq+ssq*2] 2473 movu xm5, [srcq+ssq*0] 2474 mova m4, m10 2475 vpdpwssd m4, m12, m1 ; a0 b0 2476 mova m1, m2 2477 vshufi32x4 m0, m5, q1032 ; 6 7 8 2478 vpdpwssd m4, m13, m2 ; a1 b1 2479 mova m2, m3 2480 vpdpwssd m4, m14, m3 ; a2 b2 2481 vpermb m3, m6, m0 ; 67 78 2482 vpdpwssd m4, m15, m3 ; a3 b3 2483 psrad m4, 6 2484 vextracti32x8 ym5, m4, 1 2485 packusdw ym4, ym5 2486 pminsw ym4, ym11 2487 mova [dstq+dsq*0], xm4 2488 vextracti32x4 [dstq+dsq*1], ym4, 1 2489 lea dstq, [dstq+dsq*2] 2490 sub hd, 2 2491 jg .v_w8_loop 2492 RET 2493.v_w16: 2494 vbroadcasti32x8 m0, [srcq+ssq*1] 2495 vinserti32x8 m1, m0, [srcq+ssq*2], 1 2496 vinserti32x8 m0, [srcq+ssq*0], 0 2497 mova m8, [spel_v_shuf16] 2498 add srcq, r6 2499 movu ym3, [srcq+ssq*0] 2500 vinserti32x8 m3, [srcq+ssq*1], 1 2501 movu ym5, [srcq+ssq*2] 2502 add srcq, r6 2503 vinserti32x8 m5, [srcq+ssq*0], 1 2504 vpermb m1, m8, m1 ; 12 2505 vpermb m0, m8, m0 ; 01 2506 vpermb m3, m8, m3 ; 34 2507 vpermb m5, m8, m5 ; 56 2508 mova m9, [deint_q_shuf] 2509 vpshrdd m2, m1, m3, 16 ; 23 2510 vpshrdd m4, m3, m5, 16 ; 45 2511.v_w16_loop: 2512 mova m7, m10 2513 vpdpwssd m7, m12, m1 ; b0 2514 mova m6, m10 2515 vpdpwssd m6, m12, m0 ; a0 2516 mova m1, m3 2517 vpdpwssd m7, m13, m3 ; b1 2518 mova m0, m2 2519 vpdpwssd m6, m13, m2 ; a1 2520 mova m3, m5 2521 vpdpwssd m7, m14, m5 ; b2 2522 
mova m2, m4 2523 vpdpwssd m6, m14, m4 ; a2 2524 movu ym5, [srcq+ssq*1] 2525 lea srcq, [srcq+ssq*2] 2526 vinserti32x8 m5, [srcq+ssq*0], 1 2527 vpermb m5, m8, m5 ; 78 2528 vpshrdd m4, m3, m5, 16 ; 67 2529 vpdpwssd m7, m15, m5 ; b3 2530 vpdpwssd m6, m15, m4 ; a3 2531 psrad m7, 6 2532 psrad m6, 6 2533 packusdw m6, m7 2534 pminsw m6, m11 2535 vpermq m6, m9, m6 2536 mova [dstq+dsq*0], ym6 2537 vextracti32x8 [dstq+dsq*1], m6, 1 2538 lea dstq, [dstq+dsq*2] 2539 sub hd, 2 2540 jg .v_w16_loop 2541 RET 2542.v_w32: 2543.v_w64: 2544.v_w128: 2545 WIN64_SPILL_XMM 23 2546 lea wd, [hq+wq*8-256] 2547.v_w32_loop0: 2548 movu m16, [srcq+ssq*0] 2549 movu m17, [srcq+ssq*1] 2550 lea r7, [srcq+r6 ] 2551 movu m18, [srcq+ssq*2] 2552 movu m19, [r7 +ssq*0] 2553 mov r8, dstq 2554 movu m20, [r7 +ssq*1] 2555 movu m21, [r7 +ssq*2] 2556 add r7, r6 2557 movu m22, [r7 +ssq*0] 2558 punpcklwd m0, m16, m17 ; 01l 2559 punpckhwd m16, m17 ; 01h 2560 punpcklwd m1, m17, m18 ; 12l 2561 punpckhwd m17, m18 ; 12h 2562 punpcklwd m2, m18, m19 ; 23l 2563 punpckhwd m18, m19 ; 23h 2564 punpcklwd m3, m19, m20 ; 34l 2565 punpckhwd m19, m20 ; 34h 2566 punpcklwd m4, m20, m21 ; 45l 2567 punpckhwd m20, m21 ; 45h 2568 punpcklwd m5, m21, m22 ; 56l 2569 punpckhwd m21, m22 ; 56h 2570.v_w32_loop: 2571 mova m6, m10 2572 vpdpwssd m6, m12, m0 ; a0l 2573 mova m8, m10 2574 vpdpwssd m8, m12, m16 ; a0h 2575 mova m7, m10 2576 vpdpwssd m7, m12, m1 ; b0l 2577 mova m9, m10 2578 vpdpwssd m9, m12, m17 ; b0h 2579 mova m0, m2 2580 vpdpwssd m6, m13, m2 ; a1l 2581 mova m16, m18 2582 vpdpwssd m8, m13, m18 ; a1h 2583 mova m1, m3 2584 vpdpwssd m7, m13, m3 ; b1l 2585 mova m17, m19 2586 vpdpwssd m9, m13, m19 ; b1h 2587 mova m2, m4 2588 vpdpwssd m6, m14, m4 ; a2l 2589 mova m18, m20 2590 vpdpwssd m8, m14, m20 ; a2h 2591 mova m3, m5 2592 vpdpwssd m7, m14, m5 ; b2l 2593 mova m19, m21 2594 vpdpwssd m9, m14, m21 ; b2h 2595 movu m21, [r7+ssq*1] 2596 lea r7, [r7+ssq*2] 2597 punpcklwd m4, m22, m21 ; 67l 2598 punpckhwd m20, m22, m21 ; 67h 2599 movu m22, 
[r7+ssq*0] 2600 vpdpwssd m6, m15, m4 ; a3l 2601 vpdpwssd m8, m15, m20 ; a3h 2602 punpcklwd m5, m21, m22 ; 78l 2603 punpckhwd m21, m22 ; 78h 2604 vpdpwssd m7, m15, m5 ; b3l 2605 vpdpwssd m9, m15, m21 ; b3h 2606 REPX {psrad x, 6}, m6, m8, m7, m9 2607 packusdw m6, m8 2608 packusdw m7, m9 2609 pminsw m6, m11 2610 pminsw m7, m11 2611 mova [r8+dsq*0], m6 2612 mova [r8+dsq*1], m7 2613 lea r8, [r8+dsq*2] 2614 sub hd, 2 2615 jg .v_w32_loop 2616 add srcq, 64 2617 add dstq, 64 2618 movzx hd, wb 2619 sub wd, 1<<8 2620 jg .v_w32_loop0 2621 RET 2622.h_w2: 2623 RESET_STACK_STATE 2624 mova ym2, [spel_h_shuf2a] 2625 sub srcq, 2 2626 pshufd xmm3, xmm0, q1111 2627 pshufd xmm4, xmm0, q2222 2628.h_w2_loop: 2629 movu xm1, [srcq+ssq*0] 2630 vinserti32x4 ym1, [srcq+ssq*1], 1 2631 lea srcq, [srcq+ssq*2] 2632 mova xmm0, xm8 2633 vpermb ym1, ym2, ym1 2634 vpdpwssd xmm0, xmm3, xm1 2635 vextracti32x4 xm1, ym1, 1 2636 vpdpwssd xmm0, xmm4, xm1 2637 psrad xmm0, 6 2638 packusdw xmm0, xmm0 2639 pminsw xmm0, xm15 2640 movd [dstq+dsq*0], xmm0 2641 pextrd [dstq+dsq*1], xmm0, 1 2642 lea dstq, [dstq+dsq*2] 2643 sub hd, 2 2644 jg .h_w2_loop 2645 RET 2646.h_w4: 2647 movzx mxd, mxb 2648 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2649 jl .h_w2 2650 vbroadcasti32x4 ym4, [spel_h_shufA] 2651 vbroadcasti32x4 ym5, [spel_h_shufB] 2652 sub srcq, 2 2653 pshufd xmm0, xmm0, q2211 2654 vpbroadcastq ym6, xmm0 2655 vpermq ym7, ymm0, q1111 2656.h_w4_loop: 2657 movu xm2, [srcq+ssq*0] 2658 vinserti32x4 ym2, [srcq+ssq*1], 1 2659 lea srcq, [srcq+ssq*2] 2660 mova ym0, ym8 2661 pshufb ym1, ym2, ym4 2662 vpdpwssd ym0, ym6, ym1 2663 pshufb ym2, ym5 2664 vpdpwssd ym0, ym7, ym2 2665 psrad ym0, 6 2666 vextracti32x4 xm1, ym0, 1 2667 packusdw xm0, xm1 2668 pminsw xmm0, xm0, xm15 2669 movq [dstq+dsq*0], xmm0 2670 movhps [dstq+dsq*1], xmm0 2671 lea dstq, [dstq+dsq*2] 2672 sub hd, 2 2673 jg .h_w4_loop 2674 RET 2675.h_w8: 2676 mova m4, [spel_h_shufA] 2677 movu m5, [spel_h_shufB] 2678 movu m6, [spel_h_shufC] 2679 mova m7, [spel_h_shufD] 
2680.h_w8_loop: 2681 movu ym2, [srcq+ssq*0] 2682 vinserti32x8 m2, [srcq+ssq*1], 1 2683 lea srcq, [srcq+ssq*2] 2684 mova m0, m8 2685 vpermb m1, m4, m2 2686 vpdpwssd m0, m10, m1 2687 vpermb m1, m5, m2 2688 vpdpwssd m0, m11, m1 2689 vpermb m1, m6, m2 2690 vpdpwssd m0, m12, m1 2691 vpermb m1, m7, m2 2692 vpdpwssd m0, m13, m1 2693 psrad m0, 6 2694 vextracti32x8 ym1, m0, 1 2695 packusdw ym0, ym1 2696 pminsw ym0, ym15 2697 mova [dstq+dsq*0], xm0 2698 vextracti32x4 [dstq+dsq*1], ym0, 1 2699 lea dstq, [dstq+dsq*2] 2700 sub hd, 2 2701 jg .h_w8_loop 2702 RET 2703.h: 2704 vpbroadcastw m15, r8m 2705 test myd, 0xf00 2706 jnz .hv 2707 mov r7d, r8m 2708 shr r7d, 11 2709 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] 2710 cmp wd, 4 2711 jle .h_w4 2712 shr mxd, 16 2713 sub srcq, 6 2714 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2715 mova [buf], xmm0 2716 vpbroadcastd m10, xmm0 2717 vpbroadcastd m11, [buf+ 4] 2718 vpbroadcastd m12, [buf+ 8] 2719 vpbroadcastd m13, [buf+12] 2720 sub wd, 16 2721 jl .h_w8 2722 vbroadcasti32x4 m6, [spel_h_shufA] 2723 vbroadcasti32x4 m7, [spel_h_shufB] 2724 jg .h_w32 2725.h_w16_loop: 2726 movu ym2, [srcq+ssq*0+ 0] 2727 vinserti32x8 m2, [srcq+ssq*1+ 0], 1 2728 movu ym3, [srcq+ssq*0+16] 2729 vinserti32x8 m3, [srcq+ssq*1+16], 1 2730 lea srcq, [srcq+ssq*2] 2731 mova m0, m8 2732 mova m1, m8 2733 pshufb m4, m2, m6 2734 vpdpwssd m0, m10, m4 ; a0 2735 pshufb m4, m3, m6 2736 vpdpwssd m1, m12, m4 ; b2 2737 pshufb m4, m2, m7 2738 vpdpwssd m0, m11, m4 ; a1 2739 pshufb m4, m3, m7 2740 vpdpwssd m1, m13, m4 ; b3 2741 shufpd m2, m3, 0x55 2742 pshufb m4, m2, m6 2743 vpdpwssd m0, m12, m4 ; a2 2744 vpdpwssd m1, m10, m4 ; b0 2745 pshufb m2, m7 2746 vpdpwssd m0, m13, m2 ; a3 2747 vpdpwssd m1, m11, m2 ; b1 2748 psrad m0, 6 2749 psrad m1, 6 2750 packusdw m0, m1 2751 pminsw m0, m15 2752 mova [dstq+dsq*0], ym0 2753 vextracti32x8 [dstq+dsq*1], m0, 1 2754 lea dstq, [dstq+dsq*2] 2755 sub hd, 2 2756 jg .h_w16_loop 2757 RET 2758.h_w32: 2759 lea srcq, [srcq+wq*2] 2760 lea dstq, [dstq+wq*2] 
2761 neg wq 2762.h_w32_loop0: 2763 mov r6, wq 2764.h_w32_loop: 2765 movu m2, [srcq+r6*2+ 0] 2766 movu m3, [srcq+r6*2+ 8] 2767 mova m0, m8 2768 mova m1, m8 2769 pshufb m4, m2, m6 2770 vpdpwssd m0, m10, m4 ; a0 2771 pshufb m4, m3, m6 2772 vpdpwssd m1, m10, m4 ; b0 2773 vpdpwssd m0, m12, m4 ; a2 2774 movu m4, [srcq+r6*2+16] 2775 pshufb m3, m7 2776 vpdpwssd m1, m11, m3 ; b1 2777 vpdpwssd m0, m13, m3 ; a3 2778 pshufb m3, m4, m6 2779 vpdpwssd m1, m12, m3 ; b2 2780 pshufb m2, m7 2781 vpdpwssd m0, m11, m2 ; a1 2782 pshufb m4, m7 2783 vpdpwssd m1, m13, m4 ; b3 2784 psrad m0, 6 2785 psrad m1, 6 2786 packusdw m0, m1 2787 pminsw m0, m15 2788 mova [dstq+r6*2], m0 2789 add r6, 32 2790 jl .h_w32_loop 2791 add srcq, ssq 2792 add dstq, dsq 2793 dec hd 2794 jg .h_w32_loop0 2795 RET 2796.hv: 2797 cmp wd, 4 2798 jg .hv_w8 2799 movzx mxd, mxb 2800 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2801 movzx mxd, myb 2802 shr myd, 16 2803 cmp hd, 6 2804 cmovs myd, mxd 2805 pmovsxbw xmm1, [base+subpel_filters+myq*8] 2806 lea r6, [ssq*3] 2807 sub srcq, 2 2808 sub srcq, r6 2809 test dword r8m, 0x800 2810 jnz .hv_12bit 2811 vpbroadcastd m10, [pd_2176] 2812 psllw xmm0, 6 2813 jmp .hv_main 2814.hv_12bit: 2815 vpbroadcastd m10, [pd_640] 2816 psllw xmm0, 4 2817 psllw xmm1, 2 2818.hv_main: 2819 mova [buf+ 0], xmm0 2820 mova [buf+16], xmm1 2821 vpbroadcastd m8, [buf+ 4] 2822 vpbroadcastd m9, [buf+ 8] 2823 vpbroadcastd ym11, xmm1 2824 vpbroadcastd ym12, [buf+20] 2825 vpbroadcastd ym13, [buf+24] 2826 vpbroadcastd ym14, [buf+28] 2827 movu xm4, [srcq+ssq*0] 2828 vinserti32x4 ym4, [srcq+ssq*1], 1 2829 vinserti32x4 m4, [srcq+ssq*2], 2 2830 add srcq, r6 2831 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 2832 movu xm0, [srcq+ssq*1] 2833 vinserti32x4 ym0, [srcq+ssq*2], 1 2834 add srcq, r6 2835 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2836 cmp wd, 4 2837 je .hv_w4 2838 vbroadcasti32x4 m2, [spel_h_shufA] 2839 mova m3, [spel_h_shuf2b] 2840 mova ym6, [spel_h_shuf2a] 2841 mova xm7, [spel_shuf2] 2842 mova m1, m10 2843 
pshufb m4, m2 2844 pshufb m0, m2 2845 punpcklqdq m2, m4, m0 2846 vpdpwssd m1, m8, m2 ; 04 15 26 3_ 2847 punpckhqdq m4, m0 2848 vpdpwssd m1, m9, m4 2849 vpermb m1, m3, m1 ; 01 12 2850 vextracti32x4 xm2, ym1, 1 ; 23 34 2851 vextracti32x4 xm3, m1, 2 ; 45 56 2852.hv_w2_loop: 2853 movu xm5, [srcq+ssq*1] 2854 lea srcq, [srcq+ssq*2] 2855 vinserti32x4 ym5, [srcq+ssq*0], 1 2856 mova xm4, xm10 2857 vpermb ym5, ym6, ym5 2858 pmaddwd xmm0, xm11, xm1 ; a0 b0 2859 vpdpwssd xm4, xm8, xm5 2860 vextracti32x4 xm5, ym5, 1 2861 mova xm1, xm2 2862 vpdpwssd xmm0, xm12, xm2 ; a1 b1 2863 vpdpwssd xm4, xm9, xm5 ; 7 8 2864 mova xm2, xm3 2865 vpdpwssd xmm0, xm13, xm3 ; a2 b2 2866 vpermt2b xm3, xm7, xm4 ; 67 78 2867 vpdpwssd xmm0, xm14, xm3 ; a3 b3 2868 psrad xmm0, 10 2869 packusdw xmm0, xmm0 2870 pminsw xmm0, xm15 2871 movd [dstq+dsq*0], xmm0 2872 pextrd [dstq+dsq*1], xmm0, 1 2873 lea dstq, [dstq+dsq*2] 2874 sub hd, 2 2875 jg .hv_w2_loop 2876 RET 2877.hv_w4: 2878 vbroadcasti32x4 m19, [spel_h_shufA] 2879 vbroadcasti32x4 m20, [spel_h_shufB] 2880 mova ym6, [spel_shuf4a] 2881 mova ym7, [spel_shuf4b] 2882 mova m2, m10 2883 mova m3, m10 2884 pshufb m1, m4, m19 2885 vpdpwssd m2, m8, m1 2886 pshufb m1, m0, m19 2887 vpdpwssd m3, m8, m1 2888 pshufb m4, m20 2889 vpdpwssd m2, m9, m4 2890 pshufb m0, m20 2891 vpdpwssd m3, m9, m0 2892 vpermb m1, m6, m2 ; 01 12 2893 vshufi32x4 m2, m3, q1032 2894 vpermb m3, m6, m3 ; 45 56 2895 vpermb m2, m6, m2 ; 23 34 2896.hv_w4_loop: 2897 movu xm18, [srcq+ssq*1] 2898 lea srcq, [srcq+ssq*2] 2899 vinserti128 ym18, [srcq+ssq*0], 1 2900 pmaddwd ym16, ym11, ym1 ; a0 b0 2901 mova ym1, ym2 2902 mova ym2, ym3 2903 pshufb ym17, ym18, ym19 2904 mova ym3, ym10 2905 vpdpwssd ym3, ym8, ym17 2906 pshufb ym18, ym20 2907 vpdpwssd ym16, ym12, ym1 ; a1 b1 2908 vpdpwssd ym3, ym9, ym18 ; 7 8 2909 vpdpwssd ym16, ym13, ym2 ; a2 b2 2910 vpermt2b ym3, ym7, ym2 ; 67 78 2911 vpdpwssd ym16, ym14, ym3 ; a3 b3 2912 psrad ym16, 10 2913 vextracti128 xm17, ym16, 1 2914 packusdw xm16, xm17 2915 pminsw 
xm16, xm15 2916 movq [dstq+dsq*0], xm16 2917 movhps [dstq+dsq*1], xm16 2918 lea dstq, [dstq+dsq*2] 2919 sub hd, 2 2920 jg .hv_w4_loop 2921 vzeroupper 2922 RET 2923.hv_w8: 2924 shr mxd, 16 2925 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2926 movzx mxd, myb 2927 shr myd, 16 2928 cmp hd, 6 2929 cmovs myd, mxd 2930 pmovsxbw xmm1, [base+subpel_filters+myq*8] 2931 lea r6, [ssq*3] 2932 sub srcq, 6 2933 sub srcq, r6 2934 test dword r8m, 0x800 2935 jnz .hv_w8_12bit 2936 vpbroadcastd m10, [pd_2176] 2937 psllw xmm0, 6 2938 jmp .hv_w8_main 2939.hv_w8_12bit: 2940 vpbroadcastd m10, [pd_640] 2941 psllw xmm0, 4 2942 psllw xmm1, 2 2943.hv_w8_main: 2944 mova [buf+ 0], xmm0 2945 mova [buf+16], xmm1 2946 vpbroadcastd m11, xmm0 2947 vpbroadcastd m12, [buf+ 4] 2948 vpbroadcastd m13, [buf+ 8] 2949 vpbroadcastd m14, [buf+12] 2950 vpbroadcastd m16, xmm1 2951 vpbroadcastd m17, [buf+20] 2952 vpbroadcastd m18, [buf+24] 2953 vpbroadcastd m19, [buf+28] 2954 cmp wd, 8 2955 jg .hv_w16 2956 mova m5, [spel_h_shufA] 2957 movu ym0, [srcq+ssq*0] 2958 vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 2959 movu ym9, [srcq+ssq*2] 2960 add srcq, r6 2961 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 2962 movu ym20, [srcq+ssq*1] 2963 vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 2964 add srcq, r6 2965 movu ym21, [srcq+ssq*0] ; 6 2966 movu m6, [spel_h_shufB] 2967 movu m7, [spel_h_shufC] 2968 vpermb m8, m5, m0 2969 mova m1, m10 2970 vpdpwssd m1, m11, m8 ; a0 b0 2971 vpermb m8, m5, m9 2972 mova m2, m10 2973 vpdpwssd m2, m11, m8 ; c0 d0 2974 vpermb m8, m5, m20 2975 mova m3, m10 2976 vpdpwssd m3, m11, m8 ; e0 f0 2977 vpermb m8, m5, m21 2978 mova m4, m10 2979 vpdpwssd m4, m11, m8 ; g0 2980 vpermb m8, m6, m0 2981 vpdpwssd m1, m12, m8 ; a1 b1 2982 vpermb m8, m6, m9 2983 vpdpwssd m2, m12, m8 ; c1 d1 2984 vpermb m8, m6, m20 2985 vpdpwssd m3, m12, m8 ; e1 f1 2986 vpermb m8, m6, m21 2987 vpdpwssd m4, m12, m8 ; g1 2988 vpermb m8, m7, m0 2989 vpdpwssd m1, m13, m8 ; a2 b2 2990 vpermb m8, m7, m9 2991 vpdpwssd m2, m13, m8 ; c2 d2 2992 vpermb m8, 
m7, m20 2993 vpdpwssd m3, m13, m8 ; e2 f2 2994 vpermb m8, m7, m21 2995 vpdpwssd m4, m13, m8 ; g2 2996 mova m8, [spel_h_shufD] 2997 vpermb m0, m8, m0 2998 vpdpwssd m1, m14, m0 ; a3 b3 2999 mova m0, [spel_shuf8a] 3000 vpermb m9, m8, m9 3001 vpdpwssd m2, m14, m9 ; c3 d3 3002 mova m9, [spel_shuf8b] 3003 vpermb m20, m8, m20 3004 vpdpwssd m3, m14, m20 ; e3 f3 3005 vpermb m21, m8, m21 3006 vpdpwssd m4, m14, m21 ; g3 3007 vpermt2b m1, m0, m2 ; 01 12 3008 vpermt2b m2, m0, m3 ; 23 34 3009 vpermt2b m3, m0, m4 ; 45 56 3010.hv_w8_loop: 3011 movu ym0, [srcq+ssq*1] 3012 lea srcq, [srcq+ssq*2] 3013 vinserti32x8 m0, [srcq+ssq*0], 1 3014 mova m4, m10 3015 vpermb m21, m5, m0 3016 vpdpwssd m4, m11, m21 ; h0 i0 3017 vpermb m21, m6, m0 3018 pmaddwd m20, m16, m1 ; A0 B0 3019 vpdpwssd m4, m12, m21 ; h1 i1 3020 vpermb m21, m7, m0 3021 mova m1, m2 3022 vpdpwssd m20, m17, m2 ; A1 B1 3023 vpdpwssd m4, m13, m21 ; h2 i2 3024 vpermb m21, m8, m0 3025 mova m2, m3 3026 vpdpwssd m20, m18, m3 ; A2 B2 3027 vpdpwssd m4, m14, m21 ; h3 i3 3028 vpermt2b m3, m9, m4 ; 67 78 3029 vpdpwssd m20, m19, m3 ; A3 B3 3030 psrad m20, 10 3031 vextracti32x8 ym21, m20, 1 3032 packusdw ym20, ym21 3033 pminsw ym20, ym15 3034 mova [dstq+dsq*0], xm20 3035 vextracti128 [dstq+dsq*1], ym20, 1 3036 lea dstq, [dstq+dsq*2] 3037 sub hd, 2 3038 jg .hv_w8_loop 3039 vzeroupper 3040 RET 3041.hv_w16: 3042 WIN64_SPILL_XMM 26 3043 vbroadcasti32x4 m20, [spel_h_shufA] 3044 vbroadcasti32x4 m21, [spel_h_shufB] 3045 add wd, wd 3046 mova m9, [spel_shuf16] 3047 lea wd, [hq+wq*8-256] 3048.hv_w16_loop0: 3049 vbroadcasti32x8 m5, [srcq+ssq*0+ 8] 3050 vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 3051 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 3052 movu ym6, [srcq+ssq*1+ 0] 3053 movu ym7, [srcq+ssq*1+16] 3054 lea r7, [srcq+r6] 3055 vinserti32x8 m6, [srcq+ssq*2+ 0], 1 3056 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 3057 movu ym22, [r7 +ssq*0+ 0] 3058 movu ym23, [r7 +ssq*0+16] 3059 mov r8, dstq 3060 vinserti32x8 m22, [r7 +ssq*1+ 0], 1 3061 vinserti32x8 m23, [r7 
+ssq*1+16], 1 ; 3 4 3062 movu ym24, [r7 +ssq*2+ 0] 3063 movu ym25, [r7 +ssq*2+16] 3064 add r7, r6 3065 vinserti32x8 m24, [r7 +ssq*0+ 0], 1 3066 vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 3067 pshufb m0, m4, m20 3068 mova m1, m10 3069 vpdpwssd m1, m11, m0 ; a0 3070 pshufb m0, m6, m20 3071 mova m2, m10 3072 vpdpwssd m2, m11, m0 ; b0 3073 pshufb m0, m7, m20 3074 mova m3, m10 3075 vpdpwssd m3, m13, m0 ; c2 3076 pshufb m0, m4, m21 3077 vpdpwssd m1, m12, m0 ; a1 3078 pshufb m0, m6, m21 3079 vpdpwssd m2, m12, m0 ; b1 3080 pshufb m0, m7, m21 3081 vpdpwssd m3, m14, m0 ; c3 3082 pshufb m0, m5, m20 3083 vpdpwssd m1, m13, m0 ; a2 3084 shufpd m6, m7, 0x55 3085 pshufb m7, m6, m20 3086 vpdpwssd m2, m13, m7 ; b2 3087 vpdpwssd m3, m11, m7 ; c0 3088 pshufb m5, m21 3089 vpdpwssd m1, m14, m5 ; a3 3090 pshufb m6, m21 3091 vpdpwssd m2, m14, m6 ; b3 3092 vpdpwssd m3, m12, m6 ; c1 3093 pshufb m0, m22, m20 3094 mova m4, m10 3095 vpdpwssd m4, m11, m0 ; d0 3096 pshufb m0, m23, m20 3097 mova m5, m10 3098 vpdpwssd m5, m13, m0 ; e2 3099 pshufb m0, m24, m20 3100 mova m6, m10 3101 vpdpwssd m6, m11, m0 ; f0 3102 pshufb m0, m25, m20 3103 mova m7, m10 3104 vpdpwssd m7, m13, m0 ; g2 3105 pshufb m0, m22, m21 3106 vpdpwssd m4, m12, m0 ; d1 3107 pshufb m0, m23, m21 3108 vpdpwssd m5, m14, m0 ; e3 3109 pshufb m0, m24, m21 3110 vpdpwssd m6, m12, m0 ; f1 3111 pshufb m0, m25, m21 3112 vpdpwssd m7, m14, m0 ; g3 3113 shufpd m22, m23, 0x55 3114 pshufb m23, m22, m20 3115 vpdpwssd m4, m13, m23 ; d2 3116 vpdpwssd m5, m11, m23 ; e0 3117 shufpd m24, m25, 0x55 3118 pshufb m25, m24, m20 3119 vpdpwssd m6, m13, m25 ; f2 3120 vpdpwssd m7, m11, m25 ; g0 3121 pshufb m22, m21 3122 vpdpwssd m4, m14, m22 ; d3 3123 vpdpwssd m5, m12, m22 ; e1 3124 pshufb m24, m21 3125 vpdpwssd m6, m14, m24 ; f3 3126 vpdpwssd m7, m12, m24 ; g1 3127 pslldq m1, 1 3128 vpermt2b m2, m9, m3 ; 12 3129 vpermt2b m4, m9, m5 ; 34 3130 vpermt2b m6, m9, m7 ; 56 3131 vpshrdd m1, m2, 16 ; 01 3132 vpshrdd m3, m2, m4, 16 ; 23 3133 vpshrdd m5, m4, m6, 16 ; 45 
3134.hv_w16_loop: 3135 movu ym24, [r7+ssq*1+ 0] 3136 movu ym25, [r7+ssq*1+16] 3137 lea r7, [r7+ssq*2] 3138 vinserti32x8 m24, [r7+ssq*0+ 0], 1 3139 vinserti32x8 m25, [r7+ssq*0+16], 1 3140 mova m7, m10 3141 mova m8, m10 3142 pshufb m0, m24, m20 3143 vpdpwssd m7, m11, m0 ; h0 3144 pshufb m0, m25, m20 3145 vpdpwssd m8, m13, m0 ; i2 3146 pmaddwd m22, m16, m1 ; A0 3147 mova m1, m3 3148 pmaddwd m23, m16, m2 ; B0 3149 mova m2, m4 3150 pshufb m0, m24, m21 3151 vpdpwssd m7, m12, m0 ; h1 3152 pshufb m0, m25, m21 3153 vpdpwssd m8, m14, m0 ; i3 3154 vpdpwssd m22, m17, m3 ; A1 3155 mova m3, m5 3156 vpdpwssd m23, m17, m4 ; B1 3157 mova m4, m6 3158 shufpd m24, m25, 0x55 3159 pshufb m25, m24, m20 3160 vpdpwssd m7, m13, m25 ; h2 3161 vpdpwssd m8, m11, m25 ; i0 3162 vpdpwssd m22, m18, m5 ; A2 3163 vpdpwssd m23, m18, m6 ; B2 3164 pshufb m24, m21 3165 vpdpwssd m7, m14, m24 ; h3 3166 vpdpwssd m8, m12, m24 ; i1 3167 vpermt2b m7, m9, m8 ; 78 3168 vpshrdd m5, m6, m7, 16 ; 67 3169 vpdpwssd m22, m19, m5 ; A3 3170 vpdpwssd m23, m19, m7 ; B3 3171 mova m6, m7 3172 psrad m22, 10 3173 psrad m23, 10 3174 vshufi32x4 m0, m22, m23, q3232 3175 vinserti32x8 m22, ym23, 1 3176 packusdw m22, m0 3177 pminsw m22, m15 3178 mova [r8+dsq*0], ym22 3179 vextracti32x8 [r8+dsq*1], m22, 1 3180 lea r8, [r8+dsq*2] 3181 sub hd, 2 3182 jg .hv_w16_loop 3183 add srcq, 32 3184 add dstq, 32 3185 movzx hd, wb 3186 sub wd, 1<<8 3187 jg .hv_w16_loop0 3188 RET 3189 3190%if WIN64 3191DECLARE_REG_TMP 6, 4 3192%else 3193DECLARE_REG_TMP 6, 7 3194%endif 3195 3196%define PREP_8TAP_FN FN prep_8tap, 3197PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc 3198PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc 3199PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc 3200PREP_8TAP_FN regular, REGULAR, REGULAR 3201 3202cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my 3203%define base r7-prep_avx512icl 3204 imul mxd, mxm, 0x010101 3205 add mxd, t0d ; 6tap_h, mx, 4tap_h 3206 imul myd, mym, 0x010101 3207 add 
myd, t1d ; 6tap_v, my, 4tap_v 3208 lea r7, [prep_avx512icl] 3209 mov wd, wm 3210 movifnidn hd, hm 3211 test mxd, 0xf00 3212 jnz .h 3213 test myd, 0xf00 3214 jnz .v 3215.prep: 3216 tzcnt wd, wd 3217 mov r5d, r7m ; bitdepth_max 3218 vpbroadcastd m5, [pw_8192] 3219 movzx wd, word [r7+wq*2+table_offset(prep,)] 3220 shr r5d, 11 3221 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] 3222 add wq, r7 3223 lea r6, [ssq*3] 3224%if WIN64 3225 pop r7 3226%endif 3227 jmp wq 3228.h_w8: 3229 mova m6, [spel_h_shufA] 3230 movu m7, [spel_h_shufC] 3231 mova m8, [prep_endB] 3232.h_w8_loop: 3233 movu ym4, [srcq+ssq*0] 3234 vinserti32x8 m4, [srcq+ssq*1], 1 3235 movu ym5, [srcq+ssq*2] 3236 vinserti32x8 m5, [srcq+r6 ], 1 3237 lea srcq, [srcq+ssq*4] 3238 mova m0, m10 3239 mova m1, m10 3240 vpermb m2, m6, m4 3241 vpermb m3, m6, m5 3242 vpdpwssd m0, m12, m2 ; a0 b0 3243 vpdpwssd m1, m12, m3 ; c0 d0 3244 vpermb m4, m7, m4 3245 vpermb m5, m7, m5 3246 vpdpwssd m0, m14, m4 ; a2 b2 3247 vpdpwssd m1, m14, m5 ; c2 d2 3248 shufpd m2, m4, 0x55 3249 shufpd m3, m5, 0x55 3250 vpdpwssd m0, m13, m2 ; a1 b1 3251 vpdpwssd m1, m13, m3 ; c1 d1 3252 vpermt2b m0, m8, m1 3253 mova [tmpq], m0 3254 add tmpq, 64 3255 sub hd, 4 3256 jg .h_w8_loop 3257 RET 3258.h: 3259 vpbroadcastd m10, [prep_8tap_rnd] 3260 test myd, 0xf00 3261 jnz .hv 3262 lea r6, [ssq*3] 3263 cmp wd, 4 3264 je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4 3265 shr mxd, 16 3266 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3267 mov r5d, r7m 3268 sub srcq, 4 3269 shr r5d, 11 3270 psllw xmm0, [base+prep_hv_shift+r5*8] 3271 mova [tmpq], xmm0 3272 vpbroadcastd m12, xmm0 3273 vpbroadcastd m13, [tmpq+ 4] 3274 vpbroadcastd m14, [tmpq+ 8] 3275 cmp wd, 16 3276 jl .h_w8 3277 vbroadcasti32x4 m5, [spel_h_shufA] 3278 vbroadcasti32x4 m6, [spel_h_shufB] 3279 mova m7, [prep_endC] 3280 jg .h_w32 3281.h_w16_loop: 3282 movu ym2, [srcq+ssq*0+ 0] 3283 vinserti32x8 m2, [srcq+ssq*1+ 0], 1 3284 movu ym3, [srcq+ssq*0+12] 3285 vinserti32x8 m3, [srcq+ssq*1+12], 1 
3286 lea srcq, [srcq+ssq*2] 3287 mova m0, m10 3288 mova m1, m10 3289 pshufb m4, m2, m5 ; 01 3290 vpdpwssd m0, m12, m4 ; a0 b0 3291 pshufb m4, m3, m6 ; 89 3292 vpdpwssd m1, m14, m4 ; a2' b2' 3293 pshufb m2, m6 ; 23 3294 pshufb m3, m5 ; 67 3295 vpdpwssd m0, m13, m2 ; a1 b1 3296 vpdpwssd m1, m13, m3 ; a1' b1' 3297 shufpd m2, m3, 0x55 ; 45 3298 vpdpwssd m0, m14, m2 ; a2 b2 3299 vpdpwssd m1, m12, m2 ; a0' b0' 3300 vpermt2b m0, m7, m1 3301 mova [tmpq], m0 3302 add tmpq, 64 3303 sub hd, 2 3304 jg .h_w16_loop 3305 RET 3306.h_w32: 3307 lea srcq, [srcq+wq*2] 3308 neg wq 3309.h_w32_loop0: 3310 mov r6, wq 3311.h_w32_loop: 3312 movu m2, [srcq+r6*2+ 0] 3313 movu m3, [srcq+r6*2+12] 3314 mova m0, m10 3315 mova m1, m10 3316 pshufb m4, m2, m5 3317 vpdpwssd m0, m12, m4 3318 pshufb m4, m3, m6 3319 vpdpwssd m1, m14, m4 3320 pshufb m2, m6 3321 pshufb m3, m5 3322 vpdpwssd m0, m13, m2 3323 vpdpwssd m1, m13, m3 3324 shufpd m2, m3, 0x55 3325 vpdpwssd m0, m14, m2 3326 vpdpwssd m1, m12, m2 3327 vpermt2b m0, m7, m1 3328 mova [tmpq], m0 3329 add tmpq, 64 3330 add r6, 32 3331 jl .h_w32_loop 3332 add srcq, ssq 3333 dec hd 3334 jg .h_w32_loop0 3335 RET 3336.v: 3337 movzx mxd, myb 3338 shr myd, 16 3339 cmp hd, 4 3340 cmove myd, mxd 3341 mov r5d, r7m 3342 vpbroadcastd m10, [prep_8tap_rnd] 3343 pmovsxbw xmm0, [base+subpel_filters+1+myq*8] 3344 tzcnt r6d, wd 3345 shr r5d, 11 3346 movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)] 3347 psllw xmm0, [base+prep_hv_shift+r5*8] 3348 add r7, r6 3349 mova [tmpq], xmm0 3350 vpbroadcastd m12, xmm0 3351 mov r6, ssq 3352 vpbroadcastd m13, [tmpq+ 4] 3353 neg r6 3354 vpbroadcastd m14, [tmpq+ 8] 3355 jmp r7 3356.v_w4: 3357 mov r3d, 0x330c 3358 movq xm1, [srcq+r6 *2] 3359 kmovw k1, r3d 3360 vpbroadcastq ym1{k1}, [srcq+r6 *1] 3361 vpbroadcastq m2, [srcq+ssq*0] 3362 vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3 3363 movq xm0, [srcq+ssq*2] 3364 mova ym4, [prep_endA] 3365 valignq m0, m1, 2 3366 punpcklwd m1, m0 ; 01 12 23 34 3367.v_w4_loop: 3368 lea srcq, [srcq+ssq*4] 3369 
movq xm2, [srcq+r6 *1] 3370 vpbroadcastq ym2{k1}, [srcq+ssq*0] 3371 vpbroadcastq m3, [srcq+ssq*1] 3372 vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3 3373 mova m3, m10 3374 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3375 valignq m0, m2, m0, 6 ; 4 5 6 7 3376 punpcklwd m0, m2 ; 45 56 67 78 3377 vpdpwssd m3, m14, m0 ; a2 b2 c2 d2 3378 vshufi32x4 m1, m0, q1032 ; 23 34 45 56 3379 vpdpwssd m3, m13, m1 ; a1 b1 c1 d1 3380 mova m1, m0 3381 mova m0, m2 3382 vpermb m3, m4, m3 3383 mova [tmpq], ym3 3384 add tmpq, 32 3385 sub hd, 4 3386 jg .v_w4_loop 3387 RET 3388.v_w8: 3389 vbroadcasti32x4 ym1, [srcq+r6 *1] 3390 mov r3d, 0x33 3391 vbroadcasti32x4 m2, [srcq+ssq*0] 3392 kmovb k1, r3d 3393 mova m6, [spel_v_shuf8] 3394 vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2 3395 vbroadcasti32x4 ym0, [srcq+ssq*1] 3396 vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4 3397 mova m7, [prep_endB] 3398 vpermb m1, m6, m1 ; 01 12 3399 vpermb m2, m6, m0 ; 23 34 3400.v_w8_loop: 3401 lea srcq, [srcq+ssq*4] 3402 vbroadcasti32x4 ym3, [srcq+r6 *1] 3403 movu xm4, [srcq+ssq*0] 3404 vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6 3405 vbroadcasti32x4 ym0, [srcq+ssq*1] 3406 vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8 3407 mova m4, m10 3408 vpdpwssd m4, m12, m1 ; a0 b0 3409 mova m5, m10 3410 vpdpwssd m5, m12, m2 ; c0 d0 3411 vpermb m1, m6, m3 ; 45 56 3412 vpdpwssd m4, m13, m2 ; a1 b1 3413 vpermb m2, m6, m0 ; 67 78 3414 vpdpwssd m5, m13, m1 ; c1 d1 3415 vpdpwssd m4, m14, m1 ; a2 b2 3416 vpdpwssd m5, m14, m2 ; c2 d2 3417 vpermt2b m4, m7, m5 3418 mova [tmpq], m4 3419 add tmpq, 64 3420 sub hd, 4 3421 jg .v_w8_loop 3422 RET 3423.v_w16: 3424 vbroadcasti32x8 m0, [srcq+r6 *1] 3425 vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2 3426 vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1 3427 mova m6, [spel_v_shuf16] 3428 movu ym3, [srcq+ssq*1] 3429 lea srcq, [srcq+ssq*2] 3430 vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4 3431 mova m7, [prep_endA] 3432 vpermb m1, m6, m1 ; 12 3433 vpermb m0, m6, m0 ; 01 3434 vpermb m3, m6, m3 ; 34 3435 vpshrdd m2, m1, m3, 16 ; 
23 3436.v_w16_loop: 3437 mova m5, m10 3438 vpdpwssd m5, m12, m1 ; b0 3439 mova m4, m10 3440 vpdpwssd m4, m12, m0 ; a0 3441 mova m1, m3 3442 vpdpwssd m5, m13, m3 ; b1 3443 movu ym3, [srcq+ssq*1] 3444 lea srcq, [srcq+ssq*2] 3445 vpdpwssd m4, m13, m2 ; a1 3446 vinserti32x8 m3, [srcq+ssq*0], 1 3447 mova m0, m2 3448 vpermb m3, m6, m3 ; 56 3449 vpshrdd m2, m1, m3, 16 ; 45 3450 vpdpwssd m5, m14, m3 ; b2 3451 vpdpwssd m4, m14, m2 ; a2 3452 vpermt2b m4, m7, m5 3453 mova [tmpq], m4 3454 add tmpq, 64 3455 sub hd, 2 3456 jg .v_w16_loop 3457 RET 3458.v_w32: 3459.v_w64: 3460.v_w128: 3461%if WIN64 3462 push r8 3463%endif 3464 mova m11, [prep_endC] 3465 lea r5, [hq+wq*8-256] 3466.v_w32_loop0: 3467 movu m4, [srcq+r6 *2] 3468 movu m5, [srcq+r6 *1] 3469 lea r7, [srcq+ssq*2] 3470 movu m6, [srcq+ssq*0] 3471 movu m7, [srcq+ssq*1] 3472 mov r8, tmpq 3473 movu m8, [r7 +ssq*0] 3474 punpcklwd m0, m4, m5 ; 01 3475 punpckhwd m4, m5 3476 punpcklwd m1, m5, m6 ; 12 3477 punpckhwd m5, m6 3478 punpcklwd m2, m6, m7 ; 23 3479 punpckhwd m6, m7 3480 punpcklwd m3, m7, m8 ; 34 3481 punpckhwd m7, m8 3482.v_w32_loop: 3483 mova m16, m10 3484 movu m9, [r7+ssq*1] 3485 mova m18, m10 3486 vpdpwssd m16, m12, m0 ; a0 3487 mova m17, m10 3488 vpdpwssd m18, m12, m4 3489 mova m19, m10 3490 vpdpwssd m17, m12, m1 ; b0 3491 lea r7, [r7+ssq*2] 3492 vpdpwssd m19, m12, m5 3493 mova m0, m2 3494 vpdpwssd m16, m13, m2 ; a1 3495 punpcklwd m2, m8, m9 ; 45 3496 mova m4, m6 3497 vpdpwssd m18, m13, m6 3498 punpckhwd m6, m8, m9 3499 movu m8, [r7+ssq*0] 3500 vpdpwssd m17, m13, m3 ; b1 3501 mova m1, m3 3502 vpdpwssd m19, m13, m7 3503 mova m5, m7 3504 vpdpwssd m16, m14, m2 ; a2 3505 punpcklwd m3, m9, m8 ; 56 3506 vpdpwssd m18, m14, m6 3507 punpckhwd m7, m9, m8 3508 vpdpwssd m17, m14, m3 ; b2 3509 vpdpwssd m19, m14, m7 3510 vpermt2b m16, m11, m18 3511 vpermt2b m17, m11, m19 3512 mova [r8+wq*0], m16 3513 mova [r8+wq*2], m17 3514 lea r8, [r8+wq*4] 3515 sub hd, 2 3516 jg .v_w32_loop 3517 add srcq, 64 3518 add tmpq, 64 3519 movzx hd, r5b 
3520 sub r5d, 1<<8 3521 jg .v_w32_loop0 3522%if WIN64 3523 pop r8 3524%endif 3525 vzeroupper 3526 RET 3527.hv_w4: 3528 movzx mxd, mxb 3529 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 3530 movzx mxd, myb 3531 shr myd, 16 3532 cmp hd, 4 3533 cmove myd, mxd 3534 mov r5d, r7m 3535 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3536 mov r6, ssq 3537 sub srcq, 2 3538 shr r5d, 11 3539 neg r6 3540 psllw xmm0, [base+prep_hv_shift+r5*8] 3541 psllw xmm1, 2 3542 mova [tmpq+ 0], xmm0 3543 mova [tmpq+16], xmm1 3544 vpbroadcastd m8, [tmpq+ 4] 3545 mov r3d, 0xf0 3546 vpbroadcastd m9, [tmpq+ 8] 3547 vpbroadcastd m12, xmm1 3548 movu xm3, [srcq+r6 *2] 3549 kmovb k1, r3d 3550 vinserti32x4 ym3, [srcq+r6 *1], 1 3551 vbroadcasti32x4 m2, [srcq+ssq*0] 3552 vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3 3553 movu xm4, [srcq+ssq*2] 3554 vbroadcasti32x4 m5, [spel_h_shufA] 3555 vbroadcasti32x4 m6, [spel_h_shufB] 3556 mova m1, m11 3557 mova m15, [spel_shuf4a] 3558 mova xm2, xm11 3559 pshufb m0, m3, m5 3560 vpdpwssd m1, m8, m0 3561 pshufb xm0, xm4, xm5 3562 vpdpwssd xm2, xm8, xm0 3563 vpbroadcastd m13, [tmpq+20] 3564 pshufb m3, m6 3565 vpbroadcastd m14, [tmpq+24] 3566 pshufb xm4, xm6 3567 mova m7, [spel_shuf4b] 3568 vpdpwssd m1, m9, m3 ; 0 1 2 3 3569 vpdpwssd xm2, xm9, xm4 ; 4 3570 vpermt2b m1, m15, m2 ; 01 12 23 34 3571 mova ym15, [prep_endA] 3572.hv_w4_loop: 3573 lea srcq, [srcq+ssq*4] 3574 movu xm4, [srcq+r6 *1] 3575 vinserti32x4 ym4, [srcq+ssq*0], 1 3576 vbroadcasti32x4 m3, [srcq+ssq*1] 3577 vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3 3578 mova m2, m11 3579 pshufb m3, m4, m5 3580 vpdpwssd m2, m8, m3 3581 mova m3, m10 3582 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3583 pshufb m4, m6 3584 vpdpwssd m2, m9, m4 ; 5 6 7 8 3585 mova m4, m1 3586 vpermt2b m1, m7, m2 ; 45 56 67 78 3587 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 3588 vshufi32x4 m4, m1, q1032 ; 23 34 45 56 3589 vpdpwssd m3, m13, m4 ; a1 b1 c1 d1 3590 vpermb m3, m15, m3 3591 mova [tmpq], ym3 3592 add tmpq, 32 3593 sub hd, 4 3594 jg .hv_w4_loop 3595 RET 3596.hv_w8: 
3597 mova m8, [spel_h_shufA] 3598 movu ym18, [srcq+r6 *2] 3599 vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1 3600 movu ym19, [srcq+ssq*0] 3601 vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3 3602 movu ym20, [srcq+ssq*2] ; 4 3603 movu m9, [spel_h_shufC] 3604 mova m21, [spel_shuf8a] 3605 mova m0, [spel_shuf8b] 3606 vpermb m4, m8, m18 3607 mova m1, m10 3608 vpermb m5, m8, m19 3609 vpdpwssd m1, m12, m4 ; a0 b0 3610 mova m2, m10 3611 vpermb m6, m8, m20 3612 vpdpwssd m2, m12, m5 ; c0 d0 3613 mova m3, m10 3614 vpermb m18, m9, m18 3615 vpdpwssd m3, m12, m6 ; e0 3616 mova m7, [prep_endB] 3617 vpermb m19, m9, m19 3618 vpdpwssd m1, m14, m18 ; a2 b2 3619 vpermb m20, m9, m20 3620 vpdpwssd m2, m14, m19 ; c2 d2 3621 shufpd m4, m18, 0x55 3622 vpdpwssd m3, m14, m20 ; e2 3623 shufpd m5, m19, 0x55 3624 vpdpwssd m1, m13, m4 ; a1 b1 3625 shufpd m6, m20, 0x55 3626 vpdpwssd m2, m13, m5 ; c1 d1 3627 vpdpwssd m3, m13, m6 ; e1 3628 vpermt2b m1, m21, m2 ; 01 12 3629 vpermt2b m2, m21, m3 ; 23 34 3630.hv_w8_loop: 3631 lea srcq, [srcq+ssq*4] 3632 movu ym18, [srcq+r6 *1] 3633 vinserti32x8 m18, [srcq+ssq*0], 1 3634 movu ym19, [srcq+ssq*1] 3635 vinserti32x8 m19, [srcq+ssq*2], 1 3636 mova m3, m10 3637 vpermb m5, m8, m18 3638 mova m4, m10 3639 vpermb m6, m8, m19 3640 vpdpwssd m3, m12, m5 ; f0 g0 3641 mova m20, m11 3642 vpdpwssd m4, m12, m6 ; h0 i0 3643 mova m21, m11 3644 vpdpwssd m20, m15, m1 ; A0 B0 3645 vpermb m18, m9, m18 3646 vpdpwssd m21, m15, m2 ; C0 D0 3647 vpermb m19, m9, m19 3648 vpdpwssd m3, m14, m18 ; f2 g2 3649 vpdpwssd m4, m14, m19 ; h2 i2 3650 shufpd m5, m18, 0x55 3651 vpdpwssd m20, m16, m2 ; A1 B1 3652 shufpd m6, m19, 0x55 3653 vpdpwssd m3, m13, m5 ; f1 g1 3654 vpdpwssd m4, m13, m6 ; h1 i1 3655 vpermt2b m2, m0, m3 ; 45 56 3656 vpdpwssd m21, m16, m2 ; C1 D1 3657 mova m1, m2 3658 vpermt2b m2, m0, m4 ; 67 78 3659 vpdpwssd m20, m17, m1 ; A2 B2 3660 vpdpwssd m21, m17, m2 ; A2 B2 3661 vpermt2b m20, m7, m21 3662 mova [tmpq], m20 3663 add tmpq, 64 3664 sub hd, 4 3665 jg .hv_w8_loop 3666 vzeroupper 3667 RET 
3668.hv: 3669 vpbroadcastd m11, [pd_128] 3670 cmp wd, 4 3671 je .hv_w4 3672 shr mxd, 16 3673 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3674 movzx mxd, myb 3675 shr myd, 16 3676 cmp hd, 6 3677 cmovs myd, mxd 3678 mov r5d, r7m 3679 pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3680 mov r6, ssq 3681 sub srcq, 4 3682 shr r5d, 11 3683 neg r6 3684 psllw xmm0, [base+prep_hv_shift+r5*8] 3685 psllw xmm1, 2 3686 mova [tmpq+ 0], xmm0 3687 mova [tmpq+16], xmm1 3688 vpbroadcastd m12, xmm0 3689 vpbroadcastd m13, [tmpq+ 4] 3690 vpbroadcastd m14, [tmpq+ 8] 3691 vpbroadcastd m15, xmm1 3692 vpbroadcastd m16, [tmpq+20] 3693 vpbroadcastd m17, [tmpq+24] 3694 cmp wd, 16 3695 jl .hv_w8 3696 vbroadcasti32x4 m8, [spel_h_shufA] 3697 vbroadcasti32x4 m9, [spel_h_shufB] 3698 jg .hv_w32 3699 vbroadcasti32x8 m6, [srcq+r6 *2+ 8] 3700 vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 3701 vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 3702 movu ym18, [srcq+r6 *1+ 0] 3703 movu ym19, [srcq+r6 *1+12] 3704 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3705 vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2 3706 movu ym20, [srcq+ssq*1+ 0] 3707 movu ym21, [srcq+ssq*1+12] 3708 lea srcq, [srcq+ssq*2] 3709 vinserti32x8 m20, [srcq+ssq*0+ 0], 1 3710 vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4 3711 pshufb m2, m8 3712 mova m1, m10 3713 pshufb m3, m18, m8 3714 vpdpwssd m1, m14, m2 ; a2 3715 mova m2, m10 3716 pshufb m4, m19, m9 3717 vpdpwssd m2, m12, m3 ; b0 c0 3718 mova m3, m10 3719 pshufb m5, m20, m8 3720 vpdpwssd m3, m14, m4 ; b2' c2' 3721 mova m4, m10 3722 pshufb m7, m21, m9 3723 vpdpwssd m4, m12, m5 ; d0 e0 3724 mova m5, m10 3725 pshufb m0, m6, m8 3726 vpdpwssd m5, m14, m7 ; d2' e2' 3727 mova m7, [spel_shuf16] 3728 pshufb m18, m9 3729 vpdpwssd m1, m12, m0 ; a0 3730 pshufb m19, m8 3731 vpdpwssd m2, m13, m18 ; b1 c1 3732 pshufb m20, m9 3733 vpdpwssd m3, m13, m19 ; b1' c1' 3734 pshufb m21, m8 3735 vpdpwssd m4, m13, m20 ; d1 e1 3736 pshufb m6, m9 3737 vpdpwssd m5, m13, m21 ; d1' e1' 3738 mova m0, [prep_endB] 3739 shufpd m18, m19, 0x55 3740 
vpdpwssd m1, m13, m6 ; a1 3741 shufpd m20, m21, 0x55 3742 vpdpwssd m2, m14, m18 ; b2 c2 3743 vpdpwssd m3, m12, m18 ; b0' c0' 3744 vpdpwssd m4, m14, m20 ; d2 e2 3745 vpdpwssd m5, m12, m20 ; d0' e0' 3746 pslldq m1, 1 3747 vpermt2b m2, m7, m3 ; 12 3748 vpermt2b m4, m7, m5 ; 34 3749 vpshrdd m1, m2, 16 ; 01 3750 vpshrdd m3, m2, m4, 16 ; 23 3751.hv_w16_loop: 3752 movu ym18, [srcq+ssq*1+ 0] 3753 movu ym19, [srcq+ssq*1+12] 3754 lea srcq, [srcq+ssq*2] 3755 vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3756 vinserti32x8 m19, [srcq+ssq*0+12], 1 3757 mova m5, m10 3758 mova m6, m10 3759 pshufb m21, m18, m8 3760 vpdpwssd m5, m12, m21 ; f0 g0 3761 pshufb m20, m19, m9 3762 mova m21, m11 3763 vpdpwssd m6, m14, m20 ; f2' g2' 3764 mova m20, m11 3765 vpdpwssd m21, m15, m2 ; B0 3766 mova m2, m4 3767 vpdpwssd m20, m15, m1 ; A0 3768 mova m1, m3 3769 pshufb m18, m9 3770 vpdpwssd m5, m13, m18 ; f1 g1 3771 pshufb m19, m8 3772 vpdpwssd m6, m13, m19 ; f1' g1' 3773 vpdpwssd m21, m16, m4 ; B1 3774 vpdpwssd m20, m16, m3 ; A1 3775 shufpd m18, m19, 0x55 3776 vpdpwssd m5, m14, m18 ; f2 g2 3777 vpdpwssd m6, m12, m18 ; f0' g0' 3778 mova m4, m7 3779 vpermi2b m4, m5, m6 ; 56 3780 vpshrdd m3, m2, m4, 16 ; 45 3781 vpdpwssd m21, m17, m4 ; B2 3782 vpdpwssd m20, m17, m3 ; A2 3783 vpermt2b m20, m0, m21 3784 mova [tmpq], m20 3785 add tmpq, 64 3786 sub hd, 2 3787 jg .hv_w16_loop 3788 vzeroupper 3789 RET 3790.hv_w32: 3791 WIN64_SPILL_XMM 29 3792%if WIN64 3793 push r8 3794%endif 3795 mova m27, [spel_shuf32] 3796 lea r5d, [hq+wq*8-256] 3797 mova m28, [prep_endC] 3798.hv_w32_loop0: 3799 movu m18, [srcq+r6 *2+ 0] 3800 movu m7, [srcq+r6 *2+12] 3801 movu m6, [srcq+r6 *1+ 0] 3802 movu m20, [srcq+r6 *1+12] 3803 lea r7, [srcq+ssq*2] 3804 movu m19, [srcq+ssq*0+ 0] 3805 movu m21, [srcq+ssq*0+12] 3806 movu m22, [srcq+ssq*1+ 0] 3807 movu m24, [srcq+ssq*1+12] 3808 mov r8, tmpq 3809 movu m23, [r7 +ssq*0+ 0] 3810 movu m25, [r7 +ssq*0+12] 3811 pshufb m1, m18, m8 3812 mova m0, m10 3813 pshufb m2, m7, m9 3814 vpdpwssd m0, m12, m1 ; a0 
3815 mova m1, m10 3816 pshufb m4, m6, m8 3817 vpdpwssd m1, m14, m2 ; a2' 3818 mova m2, m10 3819 pshufb m3, m19, m8 3820 vpdpwssd m2, m12, m4 ; b0 3821 mova m4, m10 3822 pshufb m5, m20, m9 3823 vpdpwssd m4, m12, m3 ; c0 3824 mova m3, m10 3825 pshufb m26, m21, m9 3826 vpdpwssd m3, m14, m5 ; b2' 3827 mova m5, m10 3828 pshufb m18, m9 3829 vpdpwssd m5, m14, m26 ; c2' 3830 pshufb m7, m8 3831 vpdpwssd m0, m13, m18 ; a1 3832 pshufb m6, m9 3833 vpdpwssd m1, m13, m7 ; a1' 3834 pshufb m19, m9 3835 vpdpwssd m2, m13, m6 ; b1 3836 pshufb m20, m8 3837 vpdpwssd m4, m13, m19 ; c1 3838 pshufb m21, m8 3839 vpdpwssd m3, m13, m20 ; b1' 3840 shufpd m18, m7, 0x55 3841 vpdpwssd m5, m13, m21 ; c1' 3842 shufpd m6, m20, 0x55 3843 vpdpwssd m0, m14, m18 ; a2 3844 shufpd m19, m21, 0x55 3845 vpdpwssd m1, m12, m18 ; a0' 3846 pshufb m18, m22, m8 3847 vpdpwssd m2, m14, m6 ; b2 3848 pshufb m7, m23, m8 3849 vpdpwssd m4, m14, m19 ; c2 3850 vpdpwssd m3, m12, m6 ; b0' 3851 mova m6, m10 3852 vpdpwssd m5, m12, m19 ; c0' 3853 pshufb m19, m24, m9 3854 vpdpwssd m6, m12, m18 ; d0 3855 mova m18, m10 3856 pshufb m26, m25, m9 3857 vpdpwssd m18, m12, m7 ; e0 3858 mova m7, m10 3859 pshufb m22, m9 3860 vpdpwssd m7, m14, m19 ; d2' 3861 mova m19, m10 3862 pshufb m23, m9 3863 vpdpwssd m19, m14, m26 ; e2' 3864 pshufb m24, m8 3865 vpdpwssd m6, m13, m22 ; d1 3866 pshufb m25, m8 3867 vpdpwssd m18, m13, m23 ; e1 3868 shufpd m22, m24, 0x55 3869 vpdpwssd m7, m13, m24 ; d1' 3870 shufpd m23, m25, 0x55 3871 vpdpwssd m19, m13, m25 ; e1' 3872 pslldq m0, 1 3873 vpdpwssd m6, m14, m22 ; d2 3874 pslldq m1, 1 3875 vpdpwssd m18, m14, m23 ; e2 3876 vpermt2b m2, m27, m4 ; 12 3877 vpdpwssd m7, m12, m22 ; d0' 3878 vpermt2b m3, m27, m5 ; 12' 3879 vpdpwssd m19, m12, m23 ; e0' 3880 vpshrdd m0, m2, 16 ; 01 3881 vpermt2b m6, m27, m18 ; 34 3882 vpshrdd m1, m3, 16 ; 01' 3883 vpermt2b m7, m27, m19 ; 34' 3884 vpshrdd m4, m2, m6, 16 ; 23 3885 vpshrdd m5, m3, m7, 16 ; 23' 3886.hv_w32_loop: 3887 movu m22, [r7+ssq*1+ 0] 3888 movu m24, [r7+ssq*1+12] 
3889 lea r7, [r7+ssq*2] 3890 movu m23, [r7+ssq*0+ 0] 3891 movu m25, [r7+ssq*0+12] 3892 mova m19, m11 3893 vpdpwssd m19, m15, m2 ; B0 3894 mova m21, m11 3895 vpdpwssd m21, m15, m3 ; B0' 3896 mova m18, m11 3897 vpdpwssd m18, m15, m0 ; A0 3898 mova m20, m11 3899 vpdpwssd m20, m15, m1 ; A0' 3900 mova m2, m6 3901 vpdpwssd m19, m16, m6 ; B1 3902 mova m3, m7 3903 vpdpwssd m21, m16, m7 ; B1' 3904 mova m0, m4 3905 vpdpwssd m18, m16, m4 ; A1 3906 mova m1, m5 3907 pshufb m4, m22, m8 3908 vpdpwssd m20, m16, m5 ; A1' 3909 mova m6, m10 3910 pshufb m7, m23, m8 3911 vpdpwssd m6, m12, m4 ; f0 3912 mova m4, m10 3913 pshufb m5, m24, m9 3914 vpdpwssd m4, m12, m7 ; g0 3915 mova m7, m10 3916 pshufb m26, m25, m9 3917 vpdpwssd m7, m14, m5 ; f2' 3918 mova m5, m10 3919 pshufb m22, m9 3920 vpdpwssd m5, m14, m26 ; g2' 3921 pshufb m23, m9 3922 vpdpwssd m6, m13, m22 ; f1 3923 pshufb m24, m8 3924 vpdpwssd m4, m13, m23 ; g1 3925 pshufb m25, m8 3926 vpdpwssd m7, m13, m24 ; f1' 3927 shufpd m22, m24, 0x55 3928 vpdpwssd m5, m13, m25 ; g1' 3929 shufpd m23, m25, 0x55 3930 vpdpwssd m6, m14, m22 ; f2 3931 vpdpwssd m4, m14, m23 ; g2 3932 vpdpwssd m7, m12, m22 ; f0' 3933 vpdpwssd m5, m12, m23 ; g0' 3934 vpermt2b m6, m27, m4 ; 56 3935 vpermt2b m7, m27, m5 ; 56' 3936 vpdpwssd m19, m17, m6 ; B2 3937 vpshrdd m4, m2, m6, 16 ; 45 3938 vpdpwssd m21, m17, m7 ; B2' 3939 vpshrdd m5, m3, m7, 16 ; 45' 3940 vpdpwssd m18, m17, m4 ; A2 3941 vpdpwssd m20, m17, m5 ; A2' 3942 vpermt2b m19, m28, m21 3943 vpermt2b m18, m28, m20 3944 mova [r8+wq*0], m18 3945 mova [r8+wq*2], m19 3946 lea r8, [r8+wq*4] 3947 sub hd, 2 3948 jg .hv_w32_loop 3949 add srcq, 64 3950 add tmpq, 64 3951 movzx hd, r5b 3952 sub r5d, 1<<8 3953 jg .hv_w32_loop0 3954%if WIN64 3955 pop r8 3956%endif 3957 RET 3958 3959PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc 3960PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc 3961PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc 3962PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc 
3963PREP_8TAP_FN sharp, SHARP, SHARP 3964 3965cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my 3966%define base r7-prep_avx512icl 3967 imul mxd, mxm, 0x010101 3968 add mxd, t0d ; 8tap_h, mx, 4tap_h 3969 imul myd, mym, 0x010101 3970 add myd, t1d ; 8tap_v, my, 4tap_v 3971 lea r7, [prep_avx512icl] 3972 mov wd, wm 3973 movifnidn hd, hm 3974 test mxd, 0xf00 3975 jnz .h 3976 test myd, 0xf00 3977 jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep 3978.v: 3979 movzx mxd, myb 3980 shr myd, 16 3981 cmp hd, 4 3982 cmove myd, mxd 3983 mov r5d, r7m 3984 vpbroadcastd m10, [prep_8tap_rnd] 3985 pmovsxbw xmm0, [base+subpel_filters+myq*8] 3986 tzcnt r6d, wd 3987 shr r5d, 11 3988 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] 3989 psllw xmm0, [base+prep_hv_shift+r5*8] 3990 add r7, r6 3991 lea r6, [strideq*3] 3992 sub srcq, r6 3993 mova [tmpq], xmm0 3994 vpbroadcastd m12, xmm0 3995 vpbroadcastd m13, [tmpq+ 4] 3996 vpbroadcastd m14, [tmpq+ 8] 3997 vpbroadcastd m15, [tmpq+12] 3998 jmp r7 3999.v_w4: 4000 mov r3d, 0x330c 4001 movq xm1, [srcq+strideq*0] 4002 kmovw k1, r3d 4003 vpbroadcastq ym1{k1}, [srcq+strideq*1] 4004 vpbroadcastq m0, [srcq+r6 ] 4005 vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 4006 lea srcq, [srcq+strideq*4] 4007 vpbroadcastq ym0{k1}, [srcq+strideq*0] 4008 vpbroadcastq m2, [srcq+strideq*1] 4009 vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 4010 mova ym5, [prep_endA] 4011 vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 4012 vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 4013 punpcklwd m1, m3 ; 01 12 23 34 4014 punpcklwd m2, m0 ; 23 34 45 56 4015.v_w4_loop: 4016 movq xm4, [srcq+r6 ] 4017 lea srcq, [srcq+strideq*4] 4018 vpbroadcastq ym4{k1}, [srcq+strideq*0] 4019 vpbroadcastq m3, [srcq+strideq*1] 4020 vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a 4021 mova m3, m10 4022 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 4023 valignq m1, m4, m0, 6 ; 6 7 8 9 4024 vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 4025 mova m0, m4 4026 punpcklwd m4, m1, m4 ; 67 78 
                                     ; 89 9a  (completes the "; 67 78" comment above)
    vpdpwssd             m3, m15, m4 ; a3 b3 c3 d3
    vshufi32x4           m1, m2, m4, q1032 ; 45 56 67 78
    vpdpwssd             m3, m14, m1 ; a2 b2 c2 d2
    mova                 m2, m4
    vpermb               m3, m5, m3
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    ; 8-tap vertical, width 8 (16bpc): two rows per zmm; m12-m15 hold the
    ; four coefficient pairs, m10 the rounding constant. Lowercase tags
    ; (a0..d3) mark partial products per output row; m5 accumulates rows
    ; a/b, m6 accumulates rows c/d.
    movu                xm0, [srcq+strideq*0]
    mov                 r3d, 0x33
    vbroadcasti32x4     ym1, [srcq+strideq*1]
    kmovb                k1, r3d
    mova                 m7, [spel_v_shuf8]
    vinserti64x2     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
    add                srcq, r6
    vbroadcasti32x4     ym2, [srcq+strideq*0]
    vbroadcasti32x4      m3, [srcq+strideq*1]
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vshufi64x2       m2{k1}, m1, m3, q1032 ; 2 3 4
    vinserti64x2     m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    mova                 m8, [prep_endB]
    vpermb               m1, m7, m1 ; 01 12
    vpermb               m2, m7, m2 ; 23 34
    vpermb               m3, m7, m0 ; 45 56
.v_w8_loop:
    lea                srcq, [srcq+strideq*4]
    vbroadcasti32x4     ym4, [srcq+strideq*0]
    movu                xm5, [srcq+strideq*1]
    vshufi64x2       m4{k1}, m0, m5, q1032 ; 6 7 8
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vinserti64x2     m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    mova                 m5, m10
    vpdpwssd             m5, m12, m1 ; a0 b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m2 ; c0 d0
    mova                 m1, m3
    vpdpwssd             m5, m13, m2 ; a1 b1
    vpdpwssd             m6, m13, m3 ; c1 d1
    vpermb               m2, m7, m4 ; 67 78
    vpdpwssd             m5, m14, m3 ; a2 b2
    vpermb               m3, m7, m0 ; 89 9a
    vpdpwssd             m6, m14, m2 ; c2 d2
    vpdpwssd             m5, m15, m2 ; a3 b3
    vpdpwssd             m6, m15, m3 ; c3 d3
    vpermt2b             m5, m8, m6
    mova             [tmpq], m5
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; 8-tap vertical, width 16: one row pair per zmm via spel_v_shuf16.
    vbroadcasti32x8      m0, [srcq+strideq*1]
    vinserti32x8         m1, m0, [srcq+strideq*2], 1
    vinserti32x8         m0, [srcq+strideq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+strideq*0]
    vinserti32x8         m3, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+strideq*0], 1
    mova                m11, [prep_endA]
    vpermb               m1, m8, m1 ; 12
4093 vpermb m0, m8, m0 ; 01 4094 vpermb m3, m8, m3 ; 34 4095 vpermb m5, m8, m5 ; 56 4096 vpshrdd m2, m1, m3, 16 ; 23 4097 vpshrdd m4, m3, m5, 16 ; 45 4098.v_w16_loop: 4099 mova m7, m10 4100 vpdpwssd m7, m12, m1 ; b0 4101 mova m6, m10 4102 vpdpwssd m6, m12, m0 ; a0 4103 mova m1, m3 4104 vpdpwssd m7, m13, m3 ; b1 4105 mova m0, m2 4106 vpdpwssd m6, m13, m2 ; a1 4107 mova m3, m5 4108 vpdpwssd m7, m14, m5 ; b2 4109 mova m2, m4 4110 vpdpwssd m6, m14, m4 ; a2 4111 movu ym5, [srcq+strideq*1] 4112 lea srcq, [srcq+strideq*2] 4113 vinserti32x8 m5, [srcq+strideq*0], 1 4114 vpermb m5, m8, m5 ; 78 4115 vpshrdd m4, m3, m5, 16 ; 67 4116 vpdpwssd m7, m15, m5 ; b3 4117 vpdpwssd m6, m15, m4 ; a3 4118 vpermt2b m6, m11, m7 4119 mova [tmpq], m6 4120 add tmpq, 64 4121 sub hd, 2 4122 jg .v_w16_loop 4123 RET 4124.v_w32: 4125.v_w64: 4126.v_w128: 4127 WIN64_PUSH_XMM 23 4128%if WIN64 4129 push r8 4130%endif 4131 mova m11, [prep_endC] 4132 lea r5, [hq+wq*8-256] 4133.v_w32_loop0: 4134 movu m16, [srcq+strideq*0] 4135 movu m17, [srcq+strideq*1] 4136 lea r7, [srcq+r6] 4137 movu m18, [srcq+strideq*2] 4138 movu m19, [r7 +strideq*0] 4139 mov r8, tmpq 4140 movu m20, [r7 +strideq*1] 4141 movu m21, [r7 +strideq*2] 4142 add r7, r6 4143 movu m22, [r7 +strideq*0] 4144 punpcklwd m0, m16, m17 ; 01l 4145 punpckhwd m16, m17 ; 01h 4146 punpcklwd m1, m17, m18 ; 12l 4147 punpckhwd m17, m18 ; 12h 4148 punpcklwd m2, m18, m19 ; 23l 4149 punpckhwd m18, m19 ; 23h 4150 punpcklwd m3, m19, m20 ; 34l 4151 punpckhwd m19, m20 ; 34h 4152 punpcklwd m4, m20, m21 ; 45l 4153 punpckhwd m20, m21 ; 45h 4154 punpcklwd m5, m21, m22 ; 56l 4155 punpckhwd m21, m22 ; 56h 4156.v_w32_loop: 4157 mova m6, m10 4158 vpdpwssd m6, m12, m0 ; a0l 4159 mova m8, m10 4160 vpdpwssd m8, m12, m16 ; a0h 4161 mova m7, m10 4162 vpdpwssd m7, m12, m1 ; b0l 4163 mova m9, m10 4164 vpdpwssd m9, m12, m17 ; b0h 4165 mova m0, m2 4166 vpdpwssd m6, m13, m2 ; a1l 4167 mova m16, m18 4168 vpdpwssd m8, m13, m18 ; a1h 4169 mova m1, m3 4170 vpdpwssd m7, m13, m3 ; b1l 4171 
mova m17, m19 4172 vpdpwssd m9, m13, m19 ; b1h 4173 mova m2, m4 4174 vpdpwssd m6, m14, m4 ; a2l 4175 mova m18, m20 4176 vpdpwssd m8, m14, m20 ; a2h 4177 mova m3, m5 4178 vpdpwssd m7, m14, m5 ; b2l 4179 mova m19, m21 4180 vpdpwssd m9, m14, m21 ; b2h 4181 movu m21, [r7+strideq*1] 4182 lea r7, [r7+strideq*2] 4183 punpcklwd m4, m22, m21 ; 67l 4184 punpckhwd m20, m22, m21 ; 67h 4185 movu m22, [r7+strideq*0] 4186 vpdpwssd m6, m15, m4 ; a3l 4187 vpdpwssd m8, m15, m20 ; a3h 4188 punpcklwd m5, m21, m22 ; 78l 4189 punpckhwd m21, m22 ; 78h 4190 vpdpwssd m7, m15, m5 ; b3l 4191 vpdpwssd m9, m15, m21 ; b3h 4192 vpermt2b m6, m11, m8 4193 vpermt2b m7, m11, m9 4194 mova [r8+wq*0], m6 4195 mova [r8+wq*2], m7 4196 lea r8, [r8+wq*4] 4197 sub hd, 2 4198 jg .v_w32_loop 4199 add srcq, 64 4200 add tmpq, 64 4201 movzx hd, r5b 4202 sub r5d, 1<<8 4203 jg .v_w32_loop0 4204%if WIN64 4205 pop r8 4206%endif 4207 RET 4208.h_w4: 4209 RESET_STACK_STATE 4210 movzx mxd, mxb 4211 sub srcq, 2 4212 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4213 mov r5d, r7m 4214 vbroadcasti32x4 m4, [spel_h_shufA] 4215 vbroadcasti32x4 m5, [spel_h_shufB] 4216 shr r5d, 11 4217 mova ym9, [prep_endA] 4218 psllw xmm0, [base+prep_hv_shift+r5*8] 4219 mova [tmpq], xmm0 4220 vpbroadcastd m6, [tmpq+4] 4221 vpbroadcastd m7, [tmpq+8] 4222.h_w4_loop: 4223 movu xm2, [srcq+strideq*0] 4224 vinserti32x4 ym2, [srcq+strideq*1], 1 4225 vinserti32x4 m2, [srcq+strideq*2], 2 4226 vinserti32x4 m2, [srcq+r6 ], 3 4227 lea srcq, [srcq+strideq*4] 4228 mova m0, m10 4229 pshufb m1, m2, m4 4230 vpdpwssd m0, m6, m1 4231 pshufb m2, m5 4232 vpdpwssd m0, m7, m2 4233 vpermb m0, m9, m0 4234 mova [tmpq], ym0 4235 add tmpq, 32 4236 sub hd, 4 4237 jg .h_w4_loop 4238 RET 4239.h_w8: 4240 mova m6, [spel_h_shufA] 4241 movu m7, [spel_h_shufB] 4242 movu m8, [spel_h_shufC] 4243 mova m9, [spel_h_shufD] 4244 mova m11, [prep_endB] 4245.h_w8_loop: 4246 movu ym4, [srcq+strideq*0] 4247 vinserti32x8 m4, [srcq+strideq*1], 1 4248 movu ym5, [srcq+strideq*2] 4249 vinserti32x8 
m5, [srcq+r6 ], 1 4250 lea srcq, [srcq+strideq*4] 4251 mova m0, m10 4252 mova m1, m10 4253 vpermb m2, m6, m4 4254 vpermb m3, m6, m5 4255 vpdpwssd m0, m12, m2 4256 vpdpwssd m1, m12, m3 4257 vpermb m2, m7, m4 4258 vpermb m3, m7, m5 4259 vpdpwssd m0, m13, m2 4260 vpdpwssd m1, m13, m3 4261 vpermb m2, m8, m4 4262 vpermb m3, m8, m5 4263 vpdpwssd m0, m14, m2 4264 vpdpwssd m1, m14, m3 4265 vpermb m2, m9, m4 4266 vpermb m3, m9, m5 4267 vpdpwssd m0, m15, m2 4268 vpdpwssd m1, m15, m3 4269 vpermt2b m0, m11, m1 4270 mova [tmpq], m0 4271 add tmpq, 64 4272 sub hd, 4 4273 jg .h_w8_loop 4274 RET 4275.h: 4276 vpbroadcastd m10, [prep_8tap_rnd] 4277 test myd, 0xf00 4278 jnz .hv 4279 lea r6, [strideq*3] 4280 cmp wd, 4 4281 je .h_w4 4282 shr mxd, 16 4283 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4284 mov r5d, r7m 4285 sub srcq, 6 4286 shr r5d, 11 4287 psllw xmm0, [base+prep_hv_shift+r5*8] 4288 mova [tmpq], xmm0 4289 vpbroadcastd m12, xmm0 4290 vpbroadcastd m13, [tmpq+ 4] 4291 vpbroadcastd m14, [tmpq+ 8] 4292 vpbroadcastd m15, [tmpq+12] 4293 cmp wd, 16 4294 jl .h_w8 4295 vbroadcasti32x4 m6, [spel_h_shufA] 4296 vbroadcasti32x4 m7, [spel_h_shufB] 4297 mova m11, [prep_endC] 4298 jg .h_w32 4299.h_w16_loop: 4300 movu ym2, [srcq+strideq*0+ 0] 4301 vinserti32x8 m2, [srcq+strideq*1+ 0], 1 4302 movu ym3, [srcq+strideq*0+16] 4303 vinserti32x8 m3, [srcq+strideq*1+16], 1 4304 lea srcq, [srcq+strideq*2] 4305 mova m0, m10 4306 mova m1, m10 4307 pshufb m4, m2, m6 4308 vpdpwssd m0, m12, m4 ; a0 4309 pshufb m4, m3, m6 4310 vpdpwssd m1, m14, m4 ; b2 4311 pshufb m4, m2, m7 4312 vpdpwssd m0, m13, m4 ; a1 4313 pshufb m4, m3, m7 4314 vpdpwssd m1, m15, m4 ; b3 4315 shufpd m2, m3, 0x55 4316 pshufb m4, m2, m6 4317 vpdpwssd m0, m14, m4 ; a2 4318 vpdpwssd m1, m12, m4 ; b0 4319 pshufb m2, m7 4320 vpdpwssd m0, m15, m2 ; a3 4321 vpdpwssd m1, m13, m2 ; b1 4322 vpermt2b m0, m11, m1 4323 mova [tmpq], m0 4324 add tmpq, 64 4325 sub hd, 2 4326 jg .h_w16_loop 4327 RET 4328.h_w32: 4329 lea srcq, [srcq+wq*2] 4330 neg wq 
4331.h_w32_loop0: 4332 mov r6, wq 4333.h_w32_loop: 4334 movu m2, [srcq+r6*2+ 0] 4335 movu m3, [srcq+r6*2+ 8] 4336 mova m0, m10 4337 mova m1, m10 4338 pshufb m4, m2, m6 4339 vpdpwssd m0, m12, m4 ; a0 4340 pshufb m4, m3, m6 4341 vpdpwssd m1, m12, m4 ; b0 4342 vpdpwssd m0, m14, m4 ; a2 4343 movu m4, [srcq+r6*2+16] 4344 pshufb m3, m7 4345 vpdpwssd m1, m13, m3 ; b1 4346 vpdpwssd m0, m15, m3 ; a3 4347 pshufb m3, m4, m6 4348 vpdpwssd m1, m14, m3 ; b2 4349 pshufb m2, m7 4350 vpdpwssd m0, m13, m2 ; a1 4351 pshufb m4, m7 4352 vpdpwssd m1, m15, m4 ; b3 4353 vpermt2b m0, m11, m1 4354 mova [tmpq], m0 4355 add tmpq, 64 4356 add r6, 32 4357 jl .h_w32_loop 4358 add srcq, strideq 4359 dec hd 4360 jg .h_w32_loop0 4361 RET 4362.hv: 4363 vpbroadcastd m11, [pd_128] 4364 cmp wd, 4 4365 jg .hv_w8 4366 movzx mxd, mxb 4367 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4368 movzx mxd, myb 4369 shr myd, 16 4370 cmp hd, 4 4371 cmove myd, mxd 4372 mov r5d, r7m 4373 pmovsxbw xmm1, [base+subpel_filters+myq*8] 4374 lea r6, [strideq*3] 4375 sub srcq, 2 4376 shr r5d, 11 4377 sub srcq, r6 4378 psllw xmm0, [base+prep_hv_shift+r5*8] 4379 psllw xmm1, 2 4380 mova [tmpq+ 0], xmm0 4381 mova [tmpq+16], xmm1 4382 vpbroadcastd m12, xmm1 4383 movu xm16, [srcq+strideq*0] 4384 mov r3d, 0xff0 4385 vinserti128 ym16, [srcq+strideq*1], 1 4386 kmovw k1, r3d 4387 vbroadcasti32x4 m18, [srcq+strideq*2] 4388 add srcq, r6 4389 vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 4390 movu xm17, [srcq+strideq*1] 4391 vbroadcasti32x4 ym18, [srcq+strideq*2] 4392 add srcq, r6 4393 vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 4394 vbroadcasti32x4 m5, [spel_h_shufA] 4395 vbroadcasti32x4 m6, [spel_h_shufB] 4396 vpbroadcastd m8, [tmpq+ 4] 4397 vpbroadcastd m9, [tmpq+ 8] 4398 mova m1, m10 4399 mova m19, [spel_shuf4a] 4400 mova m2, m10 4401 pshufb m0, m16, m5 4402 vpdpwssd m1, m8, m0 4403 pshufb m0, m17, m5 4404 vpdpwssd m2, m8, m0 4405 vpbroadcastd m13, [tmpq+20] 4406 pshufb m16, m6 4407 vpbroadcastd m14, [tmpq+24] 4408 pshufb m17, m6 
4409 vpbroadcastd m15, [tmpq+28] 4410 vpdpwssd m1, m9, m16 ; 0 1 2 3 4411 vpdpwssd m2, m9, m17 ; 4 5 6 4412 mova m7, [spel_shuf4b] 4413 vpermt2b m1, m19, m2 ; 01 12 23 34 4414 vpermb m2, m19, m2 ; 45 56 4415 mova ym19, [prep_endA] 4416 vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 4417.hv_w4_loop: 4418 movu xm17, [srcq+strideq*1] 4419 vinserti128 ym17, [srcq+strideq*2], 1 4420 vbroadcasti32x4 m16, [srcq+r6 ] 4421 lea srcq, [srcq+strideq*4] 4422 vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 4423 mova m18, m10 4424 pshufb m16, m17, m5 4425 vpdpwssd m18, m8, m16 4426 mova m16, m11 4427 vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 4428 pshufb m17, m6 4429 vpdpwssd m18, m9, m17 ; 7 8 9 a 4430 mova m1, m2 4431 vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 4432 vpermt2b m2, m7, m18 ; 67 78 89 9a 4433 vpdpwssd m16, m15, m2 ; a3 b3 c3 d3 4434 vshufi32x4 m1, m2, q1032 ; 45 56 67 78 4435 vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 4436 vpermb m16, m19, m16 4437 mova [tmpq], ym16 4438 add tmpq, 32 4439 sub hd, 4 4440 jg .hv_w4_loop 4441 vzeroupper 4442 RET 4443.hv_w8: 4444 shr mxd, 16 4445 pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4446 movzx mxd, myb 4447 shr myd, 16 4448 cmp hd, 6 4449 cmovs myd, mxd 4450 mov r5d, r7m 4451 pmovsxbw xmm1, [base+subpel_filters+myq*8] 4452 lea r6, [strideq*3] 4453 sub srcq, 6 4454 shr r5d, 11 4455 sub srcq, r6 4456 psllw xmm0, [base+prep_hv_shift+r5*8] 4457 psllw xmm1, 2 4458 mova [tmpq+ 0], xmm0 4459 mova [tmpq+16], xmm1 4460 vpbroadcastd m12, xmm0 4461 vpbroadcastd m13, [tmpq+ 4] 4462 vpbroadcastd m14, [tmpq+ 8] 4463 vpbroadcastd m15, [tmpq+12] 4464 vpbroadcastd m16, xmm1 4465 vpbroadcastd m17, [tmpq+20] 4466 vpbroadcastd m18, [tmpq+24] 4467 vpbroadcastd m19, [tmpq+28] 4468 cmp wd, 8 4469 jg .hv_w16 4470 WIN64_SPILL_XMM 23 4471 mova m5, [spel_h_shufA] 4472 movu ym0, [srcq+strideq*0] 4473 vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 4474 movu ym9, [srcq+strideq*2] 4475 add srcq, r6 4476 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 4477 movu ym20, [srcq+strideq*1] 4478 
vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 4479 add srcq, r6 4480 movu ym21, [srcq+strideq*0] ; 6 4481 movu m6, [spel_h_shufB] 4482 movu m7, [spel_h_shufC] 4483 mova ym22, [prep_endB] 4484 vpermb m8, m5, m0 4485 mova m1, m10 4486 vpdpwssd m1, m12, m8 ; a0 b0 4487 vpermb m8, m5, m9 4488 mova m2, m10 4489 vpdpwssd m2, m12, m8 ; c0 d0 4490 vpermb m8, m5, m20 4491 mova m3, m10 4492 vpdpwssd m3, m12, m8 ; e0 f0 4493 vpermb m8, m5, m21 4494 mova m4, m10 4495 vpdpwssd m4, m12, m8 ; g0 4496 vpermb m8, m6, m0 4497 vpdpwssd m1, m13, m8 ; a1 b1 4498 vpermb m8, m6, m9 4499 vpdpwssd m2, m13, m8 ; c1 d1 4500 vpermb m8, m6, m20 4501 vpdpwssd m3, m13, m8 ; e1 f1 4502 vpermb m8, m6, m21 4503 vpdpwssd m4, m13, m8 ; g1 4504 vpermb m8, m7, m0 4505 vpdpwssd m1, m14, m8 ; a2 b2 4506 vpermb m8, m7, m9 4507 vpdpwssd m2, m14, m8 ; c2 d2 4508 vpermb m8, m7, m20 4509 vpdpwssd m3, m14, m8 ; e2 f2 4510 vpermb m8, m7, m21 4511 vpdpwssd m4, m14, m8 ; g2 4512 mova m8, [spel_h_shufD] 4513 vpermb m0, m8, m0 4514 vpdpwssd m1, m15, m0 ; a3 b3 4515 mova m0, [spel_shuf8a] 4516 vpermb m9, m8, m9 4517 vpdpwssd m2, m15, m9 ; c3 d3 4518 mova m9, [spel_shuf8b] 4519 vpermb m20, m8, m20 4520 vpdpwssd m3, m15, m20 ; e3 f3 4521 vpermb m21, m8, m21 4522 vpdpwssd m4, m15, m21 ; g3 4523 vpermt2b m1, m0, m2 ; 01 12 4524 vpermt2b m2, m0, m3 ; 23 34 4525 vpermt2b m3, m0, m4 ; 45 56 4526.hv_w8_loop: 4527 movu ym0, [srcq+strideq*1] 4528 lea srcq, [srcq+strideq*2] 4529 vinserti32x8 m0, [srcq+strideq*0], 1 4530 mova m4, m10 4531 mova m20, m11 4532 vpermb m21, m5, m0 4533 vpdpwssd m4, m12, m21 ; h0 i0 4534 vpermb m21, m6, m0 4535 vpdpwssd m20, m16, m1 ; A0 B0 4536 vpdpwssd m4, m13, m21 ; h1 i1 4537 vpermb m21, m7, m0 4538 mova m1, m2 4539 vpdpwssd m20, m17, m2 ; A1 B1 4540 vpdpwssd m4, m14, m21 ; h2 i2 4541 vpermb m21, m8, m0 4542 mova m2, m3 4543 vpdpwssd m20, m18, m3 ; A2 B2 4544 vpdpwssd m4, m15, m21 ; h3 i3 4545 vpermt2b m3, m9, m4 ; 67 78 4546 vpdpwssd m20, m19, m3 ; A3 B3 4547 vpermb m20, m22, m20 4548 mova [tmpq], 
ym20 4549 add tmpq, 32 4550 sub hd, 2 4551 jg .hv_w8_loop 4552 RET 4553.hv_w16: 4554 WIN64_SPILL_XMM 27 4555%if WIN64 4556 push r8 4557%endif 4558 vbroadcasti32x4 m20, [spel_h_shufA] 4559 vbroadcasti32x4 m21, [spel_h_shufB] 4560 add wd, wd 4561 mova m9, [spel_shuf16] 4562 mova m26, [prep_endB] 4563 lea r5d, [hq+wq*8-256] 4564.hv_w16_loop0: 4565 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] 4566 vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 4567 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 4568 movu ym6, [srcq+strideq*1+ 0] 4569 movu ym7, [srcq+strideq*1+16] 4570 lea r7, [srcq+r6] 4571 vinserti32x8 m6, [srcq+strideq*2+ 0], 1 4572 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 4573 movu ym22, [r7 +strideq*0+ 0] 4574 movu ym23, [r7 +strideq*0+16] 4575 mov r8, tmpq 4576 vinserti32x8 m22, [r7 +strideq*1+ 0], 1 4577 vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 4578 movu ym24, [r7 +strideq*2+ 0] 4579 movu ym25, [r7 +strideq*2+16] 4580 add r7, r6 4581 vinserti32x8 m24, [r7 +strideq*0+ 0], 1 4582 vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 4583 pshufb m0, m4, m20 4584 mova m1, m10 4585 vpdpwssd m1, m12, m0 ; a0 4586 pshufb m0, m6, m20 4587 mova m2, m10 4588 vpdpwssd m2, m12, m0 ; b0 4589 pshufb m0, m7, m20 4590 mova m3, m10 4591 vpdpwssd m3, m14, m0 ; c2 4592 pshufb m0, m4, m21 4593 vpdpwssd m1, m13, m0 ; a1 4594 pshufb m0, m6, m21 4595 vpdpwssd m2, m13, m0 ; b1 4596 pshufb m0, m7, m21 4597 vpdpwssd m3, m15, m0 ; c3 4598 pshufb m0, m5, m20 4599 vpdpwssd m1, m14, m0 ; a2 4600 shufpd m6, m7, 0x55 4601 pshufb m7, m6, m20 4602 vpdpwssd m2, m14, m7 ; b2 4603 vpdpwssd m3, m12, m7 ; c0 4604 pshufb m5, m21 4605 vpdpwssd m1, m15, m5 ; a3 4606 pshufb m6, m21 4607 vpdpwssd m2, m15, m6 ; b3 4608 vpdpwssd m3, m13, m6 ; c1 4609 pshufb m0, m22, m20 4610 mova m4, m10 4611 vpdpwssd m4, m12, m0 ; d0 4612 pshufb m0, m23, m20 4613 mova m5, m10 4614 vpdpwssd m5, m14, m0 ; e2 4615 pshufb m0, m24, m20 4616 mova m6, m10 4617 vpdpwssd m6, m12, m0 ; f0 4618 pshufb m0, m25, m20 4619 mova m7, m10 4620 
vpdpwssd m7, m14, m0 ; g2 4621 pshufb m0, m22, m21 4622 vpdpwssd m4, m13, m0 ; d1 4623 pshufb m0, m23, m21 4624 vpdpwssd m5, m15, m0 ; e3 4625 pshufb m0, m24, m21 4626 vpdpwssd m6, m13, m0 ; f1 4627 pshufb m0, m25, m21 4628 vpdpwssd m7, m15, m0 ; g3 4629 shufpd m22, m23, 0x55 4630 pshufb m23, m22, m20 4631 vpdpwssd m4, m14, m23 ; d2 4632 vpdpwssd m5, m12, m23 ; e0 4633 shufpd m24, m25, 0x55 4634 pshufb m25, m24, m20 4635 vpdpwssd m6, m14, m25 ; f2 4636 vpdpwssd m7, m12, m25 ; g0 4637 pshufb m22, m21 4638 vpdpwssd m4, m15, m22 ; d3 4639 vpdpwssd m5, m13, m22 ; e1 4640 pshufb m24, m21 4641 vpdpwssd m6, m15, m24 ; f3 4642 vpdpwssd m7, m13, m24 ; g1 4643 pslldq m1, 1 4644 vpermt2b m2, m9, m3 ; 12 4645 vpermt2b m4, m9, m5 ; 34 4646 vpermt2b m6, m9, m7 ; 56 4647 vpshrdd m1, m2, 16 ; 01 4648 vpshrdd m3, m2, m4, 16 ; 23 4649 vpshrdd m5, m4, m6, 16 ; 45 4650.hv_w16_loop: 4651 movu ym24, [r7+strideq*1+ 0] 4652 movu ym25, [r7+strideq*1+16] 4653 lea r7, [r7+strideq*2] 4654 vinserti32x8 m24, [r7+strideq*0+ 0], 1 4655 vinserti32x8 m25, [r7+strideq*0+16], 1 4656 mova m7, m10 4657 mova m8, m10 4658 pshufb m0, m24, m20 4659 vpdpwssd m7, m12, m0 ; h0 4660 mova m22, m11 4661 pshufb m0, m25, m20 4662 vpdpwssd m8, m14, m0 ; i2 4663 mova m23, m11 4664 vpdpwssd m22, m16, m1 ; A0 4665 mova m1, m3 4666 vpdpwssd m23, m16, m2 ; B0 4667 mova m2, m4 4668 pshufb m0, m24, m21 4669 vpdpwssd m7, m13, m0 ; h1 4670 pshufb m0, m25, m21 4671 vpdpwssd m8, m15, m0 ; i3 4672 vpdpwssd m22, m17, m3 ; A1 4673 mova m3, m5 4674 vpdpwssd m23, m17, m4 ; B1 4675 mova m4, m6 4676 shufpd m24, m25, 0x55 4677 pshufb m25, m24, m20 4678 vpdpwssd m7, m14, m25 ; h2 4679 vpdpwssd m8, m12, m25 ; i0 4680 vpdpwssd m22, m18, m5 ; A2 4681 vpdpwssd m23, m18, m6 ; B2 4682 pshufb m24, m21 4683 vpdpwssd m7, m15, m24 ; h3 4684 vpdpwssd m8, m13, m24 ; i1 4685 vpermt2b m7, m9, m8 ; 78 4686 vpshrdd m5, m6, m7, 16 ; 67 4687 vpdpwssd m22, m19, m5 ; A3 4688 vpdpwssd m23, m19, m7 ; B3 4689 mova m6, m7 4690 vpermt2b m22, m26, m23 4691 
mova [r8+wq*0], ym22 4692 vextracti32x8 [r8+wq*1], m22, 1 4693 lea r8, [r8+wq*2] 4694 sub hd, 2 4695 jg .hv_w16_loop 4696 add srcq, 32 4697 add tmpq, 32 4698 movzx hd, r5b 4699 sub r5d, 1<<8 4700 jg .hv_w16_loop0 4701%if WIN64 4702 pop r8 4703%endif 4704 RET 4705 4706%if WIN64 4707DECLARE_REG_TMP 5 4708%else 4709DECLARE_REG_TMP 7 4710%endif 4711 4712cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts 4713%define base r6-pd_0to7 4714 mov t0d, r7m 4715 lea r6, [pd_0to7] 4716 shr t0d, 11 4717 vpbroadcastd m8, [base+warp_8x8t_rnd_v] 4718 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4719 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main 4720 psrad m14, m16, 15 4721 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4722 psrad m16, 15 4723 packssdw m14, m16 4724 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4725 psrad m15, m16, 15 4726 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4727 add tsq, tsq 4728 psrad m16, 15 4729 packssdw m15, m16 4730 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end 4731 4732cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd 4733 mov t0d, r7m ; pixel_max 4734 lea r6, [pd_0to7] 4735 shr t0d, 11 4736 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4737 vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] 4738 call .main 4739 psrad m14, m16, 13 4740 call .main2 4741 psrad m16, 13 4742 packusdw m14, m16 4743 call .main2 4744 psrad m15, m16, 13 4745 call .main2 4746 vpbroadcastd m0, [base+bidir_shift+t0*4] 4747 vpsrlvw m14, m0 4748 psrad m16, 13 4749 packusdw m15, m16 4750 vpsrlvw m15, m0 4751.end: 4752 mova m0, [base+warp8x8_end] 4753 vpermb m16, m0, m14 4754 lea r2, [dsq*3] 4755 mova [dstq+dsq*0], xm16 4756 vextracti128 [dstq+dsq*1], ym16, 1 4757 vextracti32x4 [dstq+dsq*2], m16, 2 4758 vextracti32x4 [dstq+r2 ], m16, 3 4759 vpermb m16, m0, m15 4760 lea dstq, [dstq+dsq*4] 4761 mova [dstq+dsq*0], xm16 4762 vextracti128 [dstq+dsq*1], ym16, 1 4763 
vextracti32x4 [dstq+dsq*2], m16, 2 4764 vextracti32x4 [dstq+r2 ], m16, 3 4765 RET 4766.main: 4767 vpbroadcastd ym3, [base+pd_512] 4768%if WIN64 4769 mov abcdq, r5mp 4770 vpaddd ym18, ym3, r6m {1to8} ; mx 4771%else 4772 add r5d, 512 4773 vpbroadcastd ym18, r5d 4774%endif 4775 vpaddd ym20, ym3, r7m {1to8} ; my 4776 mova ym16, [base+pd_0to7] 4777 vpbroadcastd ym19, [abcdq+4*0] ; alpha 4778 vpbroadcastd ym21, [abcdq+4*1] ; gamma 4779 lea r4, [ssq*3+6] 4780 vpdpwssd ym18, ym19, ym16 ; tmx 4781 vpdpwssd ym20, ym21, ym16 ; tmy 4782 sub srcq, r4 4783 mova m10, [base+warp8x8_permA] 4784 lea r4, [mc_warp_filter+64*8] 4785 vbroadcasti32x4 m12, [base+warp8x8_permC] 4786 kxnorb k1, k1, k1 4787 vbroadcasti32x4 m13, [base+warp8x8_permD] 4788 movu ym5, [srcq+0] 4789 vinserti32x8 m5, [srcq+8], 1 4790 psrad ym17, ym18, 10 4791 mova m11, [base+warp8x8_permB] 4792 kmovb k2, k1 4793 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 4794 psrad ym19, 16 ; beta 4795 psrad ym21, 16 ; delta 4796 paddd ym18, ym19 4797 vpermb m4, m10, m5 4798 vpbroadcastq m9, [base+warp_shift_h+t0*8] 4799 pshufd m3, m3, q3120 4800 paddd m7, m1, m1 4801 pshufb m2, m3, m12 4802 vpdpwssd m1, m4, m2 4803 vpermb m5, m11, m5 4804 vshufi32x4 m4, m5, q1021 4805 pshufb m3, m13 4806 vpdpwssd m1, m4, m3 4807 call .h 4808 psllq m2, m1, 32 4809 paddd m1, m2 4810 vpmultishiftqb m1, m9, m1 4811 vpshrdq m1, m0, 48 ; 01 12 4812 call .h 4813 vpshrdq m2, m1, m0, 48 ; 23 34 4814 call .h 4815 vpshrdq m3, m2, m0, 48 ; 45 56 4816.main2: 4817 call .h 4818 psrad ym6, ym20, 10 4819 kmovb k1, k2 4820 paddd ym17, ym20, ym21 ; my += delta 4821 vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 4822 psrad ym16, ym17, 10 4823 kmovb k2, k1 4824 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 4825 shufps m5, m20, m6, q2020 4826 mova m16, m8 4827 pshufb m4, m5, m12 4828 vpdpwssd m16, m1, m4 ; a0 b0 4829 pshufb m5, m13 4830 mova m1, m2 4831 vpdpwssd m16, m2, m5 ; a1 b1 4832 shufps m6, m20, m6, q3131 4833 paddd ym20, ym17, ym21 4834 pshufb m4, m6, m12 4835 mova 
m2, m3 4836 vpdpwssd m16, m3, m4 ; a2 b2 4837 vpshrdq m3, m0, 48 ; 67 78 4838 pshufb m6, m13 4839 vpdpwssd m16, m3, m6 ; a3 b3 4840 ret 4841ALIGN function_align 4842.h: 4843 movu ym16, [srcq+ssq*1] 4844 psrad ym6, ym18, 10 4845 lea srcq, [srcq+ssq*2] 4846 vinserti32x8 m5, m16, [srcq+ssq*0], 1 4847 kmovb k1, k2 4848 paddd ym17, ym18, ym19 ; mx += beta 4849 vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 4850 psrad ym16, ym17, 10 4851 kmovb k2, k1 4852 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 4853 vpermb m4, m10, m5 4854 shufps m16, m18, m6, q2020 4855 shufps m6, m18, m6, q3131 4856 mova m0, m7 4857 pshufb m18, m16, m12 4858 vpdpwssd m0, m4, m18 ; a0 b0 4859 vpermb m5, m11, m5 4860 pshufb m18, m6, m13 4861 vpdpwssd m0, m5, m18 ; a3 b3 4862 paddd ym18, ym17, ym19 4863 vshufi32x4 m17, m4, m5, q1021 4864 pshufb m16, m13 4865 vpdpwssd m0, m17, m16 ; a1 b1 4866 vshufi32x4 m4, m5, q2132 4867 pshufb m6, m12 4868 vpdpwssd m0, m4, m6 ; a2 b2 4869 vpmultishiftqb m0, m9, m0 ; a a b b 4870 ret 4871 4872%macro BIDIR_FN 0 4873 call .main 4874 lea stride3q, [strideq*3] 4875 jmp wq 4876.w4: 4877 movq [dstq ], xm0 4878 movhps [dstq+strideq*1], xm0 4879 vextracti32x4 xm2, ym0, 1 4880 movq [dstq+strideq*2], xm2 4881 movhps [dstq+stride3q ], xm2 4882 cmp hd, 8 4883 jl .w4_end 4884 vextracti32x4 xm2, m0, 2 4885 lea dstq, [dstq+strideq*4] 4886 movq [dstq ], xm2 4887 movhps [dstq+strideq*1], xm2 4888 vextracti32x4 xm0, m0, 3 4889 movq [dstq+strideq*2], xm0 4890 movhps [dstq+stride3q ], xm0 4891 je .w4_end 4892 lea dstq, [dstq+strideq*4] 4893 movq [dstq ], xm1 4894 movhps [dstq+strideq*1], xm1 4895 vextracti32x4 xm0, ym1, 1 4896 movq [dstq+strideq*2], xm0 4897 movhps [dstq+stride3q ], xm0 4898 vextracti32x4 xm0, m1, 2 4899 lea dstq, [dstq+strideq*4] 4900 movq [dstq ], xm0 4901 movhps [dstq+strideq*1], xm0 4902 vextracti32x4 xm1, m1, 3 4903 movq [dstq+strideq*2], xm1 4904 movhps [dstq+stride3q ], xm1 4905.w4_end: 4906 RET 4907.w8_loop: 4908 call .main 4909 lea dstq, [dstq+strideq*4] 4910.w8: 
; Tail of the BIDIR_FN store macro (the macro header and the w4 case begin
; before this chunk). Each width case stores the two 512-bit rows of output
; that .main leaves in m0/m1, then loops. NOTE(review): register/label
; contract (m0/m1 = two output rows per .main call) inferred from the
; visible stores and calls below — macro start not visible here.
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                hd, 8
    jl .w8_end
    lea              dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
.w16:
    ; 16 pixels/row: one ymm per row, 4 rows per iteration
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea              dstq, [dstq+strideq*2]
.w32:
    ; 32 pixels/row: one zmm per row, 2 rows per iteration
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    sub                hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add              dstq, strideq
.w64:
    ; 64 pixels/row: two zmm per row, 1 row per iteration
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    dec                hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add              dstq, strideq
.w128:
    ; 128 pixels/row: two .main calls per row (four zmm stores)
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    call .main
    mova          [dstq+64*2], m0
    mova          [dstq+64*3], m1
    dec                hd
    jg .w128_loop
    RET
%endmacro

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; avg: average of two int16 intermediate buffers (tmp1/tmp2), with a
; round/shift pair selected by pixel_max>>11 (i.e. 10- vs 12-bit content).
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                r6, [avg_avx512icl_table]
    tzcnt              wd, wm
    mov               t0d, r6m ; pixel_max
    movsxd             wq, [r6+wq*4]
    shr               t0d, 11   ; 0 for 10-bit, 1 for 12-bit depth
    vpbroadcastd       m2, [base+avg_round+t0*4]
    vpbroadcastd       m3, [base+avg_shift+t0*4]
    movifnidn          hd, hm
    add                wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce two rows in m0/m1: saturating add of the two intermediate
    ; buffers, clamp+unbias against the rounding constant, variable shift.
    mova               m0, [tmp1q+64*0]
    paddsw             m0, [tmp2q+64*0]
    mova               m1, [tmp1q+64*1]
    paddsw             m1, [tmp2q+64*1]
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    pmaxsw             m0, m2
    pmaxsw             m1, m2
    psubsw             m0, m2
    psubsw             m1, m2
    vpsrlvw            m0, m3
    vpsrlvw            m1, m3
    ret

; w_avg: weighted average. The weight pair (16-weight, weight) is packed
; into one dword and applied with a word dot-product (vpdpwssd).
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                r6, [w_avg_avx512icl_table]
    tzcnt              wd, wm
    mov               t0d, r7m ; pixel_max
    shr               t0d, 11
    movsxd             wq, [r6+wq*4]
    vpbroadcastd       m5, [base+w_avg_round+t0*4]
    vpbroadcastd       m7, [base+bidir_shift+t0*4]
    add                wq, r6
    mov               r6d, r6m ; weight
    lea               t0d, [r6-16]
    shl               r6d, 16
    sub               r6d, t0d ; 16-weight, weight
    movifnidn          hd, hm
    vpbroadcastd       m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    mova               m3, [tmp1q+64*0]
    mova               m1, [tmp2q+64*0]
    mova               m0, [tmp1q+64*1]
    mova               m4, [tmp2q+64*1]
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    ; Interleave tmp2/tmp1 words so vpdpwssd computes
    ; tmp2*(16-weight) + tmp1*weight per dword lane.
    punpcklwd          m2, m1, m3
    punpckhwd          m1, m3
    punpcklwd          m3, m4, m0
    punpckhwd          m4, m0
    mova               m0, m5  ; seed accumulators with the rounding constant
    vpdpwssd           m0, m6, m2
    mova               m2, m5
    vpdpwssd           m2, m6, m1
    mova               m1, m5
    vpdpwssd           m1, m6, m3
    mova               m3, m5
    vpdpwssd           m3, m6, m4
    REPX     {psrad x, 2}, m0, m2, m1, m3
    packusdw           m0, m2
    packusdw           m1, m3
    vpsrlvw            m0, m7
    vpsrlvw            m1, m7
    ret

; mask: blend of the two intermediate buffers using an explicit per-pixel
; 8-bit mask m (0..64): tmp1*m + tmp2*(64-m), then round/shift.
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea                r7, [mask_avx512icl_table]
    tzcnt              wd, wm
    mov               r6d, r7m ; pixel_max
    movifnidn          hd, hm
    shr               r6d, 11
    movsxd             wq, [r7+wq*4]
    vpbroadcastd       m8, [base+pw_64]
    vpbroadcastd       m9, [base+mask_round+r6*4]
    vpbroadcastd      m10, [base+bidir_shift+r6*4]
    mov             maskq, maskmp
    add                wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    pmovzxbw           m1, [maskq+32*0]  ; widen 8-bit mask to words
    mova               m4, [tmp1q+64*0]
    mova               m2, [tmp2q+64*0]
    pmovzxbw           m6, [maskq+32*1]
    mova               m5, [tmp1q+64*1]
    mova               m3, [tmp2q+64*1]
    add             maskq, 32*2
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    punpcklwd          m7, m4, m2
    punpckhwd          m4, m2
    psubw              m0, m8, m1
    punpcklwd          m2, m1, m0 ; m, 64-m
    punpckhwd          m1, m0
    mova               m0, m9
    vpdpwssd           m0, m7, m2
    mova               m2, m9
    vpdpwssd           m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    punpcklwd          m7, m5, m3
    punpckhwd          m5, m3
    psubw              m1, m8, m6
    punpcklwd          m3, m6, m1
    punpckhwd          m6, m1
    mova               m1, m9
    vpdpwssd           m1, m7, m3
    mova               m3, m9
    vpdpwssd           m3, m5, m6
    REPX     {psrad x, 4}, m0, m2, m1, m3
    packusdw           m0, m2
    packusdw           m1, m3
    vpsrlvw            m0, m10
    vpsrlvw            m1, m10
    ret

; w_mask_420: like mask, but the mask is derived from |tmp1-tmp2| inside
; .main and also written out, downsampled 2x2 (4:2:0 chroma layout). The
; 'sign' argument selects the rounding constant used for the mask sum.
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea                r7, [w_mask_420_avx512icl_table]
    tzcnt              wd, wm
    mov               r6d, r8m ; pixel_max
    movifnidn          hd, hm
    shr               r6d, 11
    movsxd             wq, [r7+wq*4]
    vpbroadcastd      m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd      m11, [base+pw_64]
    vpbroadcastd      m12, [base+mask_round+r6*4]
    vpbroadcastd      m13, [base+bidir_shift+r6*4]
    mov               r6d, r7m ; sign
    vpbroadcastd      m14, [base+w_mask_round+r6*4]
    mova             ym15, [w_mask_end42x]
    mov             maskq, maskmp
    add                wq, r7
    call .main
    lea          stride3q, [strideq*3]
    jmp                wq
.w4:
    ; .main leaves the per-pixel mask words in m2/m3; pack 2x2 sums to
    ; bytes via vpdpbusd against pb_64 and permute into mask layout.
    mova               m4, [w_mask_shuf4]
    vpermt2b           m2, m4, m3
    mova               m3, m14
    vpdpbusd           m3, m2, [pb_64] {1to16}
    vpermb             m3, m15, m3
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4     xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    mova          [maskq], xm3
    cmp                hd, 8
    jl .w4_end
    vextracti32x4     xm2, m0, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4     xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4     xm2, m1, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8:
    mova               m8, [w_mask_shuf8]
    vpbroadcastd       m9, [pb_64]
    jmp .w8_start
.w8_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
    add             maskq, 16
.w8_start:
    vpermt2b           m2, m8, m3
    mova               m3, m14
    vpdpbusd           m3, m2, m9
    vpermb             m3, m15, m3
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    mova          [maskq], xm3
    sub                hd, 8
    jl .w8_end
    lea              dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16:
    mova               m8, [w_mask_shuf16]
    vpbroadcastd       m9, [pb_64]
    jmp .w16_start
.w16_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
    add             maskq, 16
.w16_start:
    vpermt2b           m2, m8, m3
    mova               m3, m14
    vpdpbusd           m3, m2, m9
    vpermb             m3, m15, m3
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    mova          [maskq], xm3
    sub                hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
    add             maskq, 32
.w32:
    ; Sum mask rows pairwise (paddw), then horizontally via vpdpwssd
    ; against pw_64; two .main calls cover the 2x vertical subsampling.
    paddw              m2, m3
    mova               m8, m14
    vpdpwssd           m8, m11, m2
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    call .main
    paddw              m2, m3
    mova               m3, m14
    vpdpwssd           m3, m11, m2
    vpermt2b           m8, m15, m3
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q ], m1
    mova          [maskq], ym8
    sub                hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea              dstq, [dstq+strideq*2]
    add             maskq, 32
.w64:
    mova               m8, m2  ; stash row 0's mask until row 1 is computed
    mova               m9, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    paddw              m8, m2
    paddw              m9, m3
    mova               m2, m14
    vpdpwssd           m2, m11, m8
    mova               m3, m14
    vpdpwssd           m3, m11, m9
    vpermt2b           m2, m15, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    mova          [maskq], ym2
    sub                hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea              dstq, [dstq+strideq*2]
    add             maskq, 64
.w128:
    ; Four .main calls per 2 rows; m16/m17 (upper-bank zmm) hold the
    ; pending mask halves, hence the trailing vzeroupper.
    mova              m16, m2
    mova               m8, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    mova              m17, m2
    mova               m9, m3
    mova [dstq+strideq*0+64*2], m0
    mova [dstq+strideq*0+64*3], m1
    call .main
    paddw              m2, m16
    paddw              m3, m8
    mova              m16, m14
    vpdpwssd          m16, m11, m2
    mova               m8, m14
    vpdpwssd           m8, m11, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    call .main
    paddw              m2, m17
    paddw              m3, m9
    mova              m17, m14
    vpdpwssd          m17, m11, m2
    mova               m9, m14
    vpdpwssd           m9, m11, m3
    vpermt2b          m16, m15, m8
    vpermt2b          m17, m15, m9
    mova [dstq+strideq*1+64*2], m0
    mova [dstq+strideq*1+64*3], m1
    mova     [maskq+32*0], ym16
    mova     [maskq+32*1], ym17
    sub                hd, 2
    jg .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    ; Derive the mask from the difference of the intermediates:
    ; 64-m = (pw_27615 -us |tmp1-tmp2|) >> 10, m = 64 - (64-m),
    ; then blend tmp1/tmp2 with it. Outputs: pixels in m0/m1,
    ; mask words (m) in m2/m3 for the caller to downsample.
    mova               m1, [tmp1q+64*0]
    mova               m3, [tmp2q+64*0]
    mova               m4, [tmp1q+64*1]
    mova               m7, [tmp2q+64*1]
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    psubsw             m6, m1, m3
    punpcklwd          m5, m3, m1
    pabsw              m6, m6
    punpckhwd          m3, m1
    psubusw            m6, m10, m6
    psrlw              m6, 10    ; 64-m
    psubw              m2, m11, m6 ; m
    punpcklwd          m1, m6, m2
    punpckhwd          m6, m2
    mova               m0, m12
    vpdpwssd           m0, m5, m1
    mova               m1, m12
    vpdpwssd           m1, m3, m6
    psubsw             m5, m4, m7
    punpcklwd          m6, m7, m4
    pabsw              m5, m5
    punpckhwd          m7, m4
    psubusw            m5, m10, m5
    psrlw              m5, 10
    psubw              m3, m11, m5
    punpcklwd          m4, m5, m3
    psrad              m0, 4
    punpckhwd          m5, m3
    psrad              m1, 4
    packusdw           m0, m1
    mova               m1, m12
    vpdpwssd           m1, m6, m4
    mova               m4, m12
    vpdpwssd           m4, m7, m5
    psrad              m1, 4
    psrad              m4, 4
    packusdw           m1, m4
    vpsrlvw            m0, m13
    vpsrlvw            m1, m13
    ret

; w_mask_422: same mask derivation, but the mask is only subsampled
; horizontally (4:2:2); .main writes the mask itself, so the width cases
; are plain BIDIR_FN-style stores.
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea                r7, [w_mask_422_avx512icl_table]
    tzcnt              wd, wm
    mov               r6d, r8m ; pixel_max
    movifnidn          hd, hm
    shr               r6d, 11
    movsxd             wq, [r7+wq*4]
    vpbroadcastd       m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd       m9, [base+pw_64]
    vpbroadcastd      m10, [base+mask_round+r6*4]
    vpbroadcastd      m11, [base+bidir_shift+r6*4]
    mov               r6d, r7m ; sign
    vpbroadcastd      m12, [base+w_mask_round+r6*4]
    mova             ym13, [w_mask_end42x]
    mov             maskq, maskmp
    add                wq, r7
    paddw             m14, m9, m9 ; pw_128
    call .main
    lea          stride3q, [strideq*3]
    jmp                wq
.w4:
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4     xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                hd, 8
    jl .w4_end
    vextracti32x4     xm2, m0, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4     xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4     xm2, m1, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                hd, 8
    jl .w8_end
    lea              dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea              dstq, [dstq+strideq*2]
.w32:
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    sub                hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add              dstq, strideq
.w64:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    dec                hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add              dstq, strideq
.w128:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    call .main
    mova          [dstq+64*2], m0
    mova          [dstq+64*3], m1
    dec                hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Same difference-derived mask as w_mask_420's .main, but the
    ; horizontal pair-sum (vpdpwssd with pw_128 in m14) and the mask
    ; store happen here, advancing maskq by 32 per call.
    mova               m1, [tmp1q+64*0]
    mova               m3, [tmp2q+64*0]
    mova               m4, [tmp1q+64*1]
    mova               m7, [tmp2q+64*1]
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    psubsw             m6, m1, m3
    punpcklwd          m5, m3, m1
    pabsw              m6, m6
    punpckhwd          m3, m1
    psubusw            m6, m8, m6
    psrlw              m6, 10
    psubw              m2, m9, m6
    punpcklwd          m1, m6, m2
    punpckhwd          m6, m2
    mova               m0, m10
    vpdpwssd           m0, m5, m1
    mova               m1, m10
    vpdpwssd           m1, m3, m6
    psubsw             m5, m4, m7
    punpcklwd          m6, m7, m4
    pabsw              m5, m5
    punpckhwd          m7, m4
    psubusw            m5, m8, m5
    psrlw              m5, 10
    psubw              m3, m9, m5
    punpcklwd          m4, m5, m3
    psrad              m0, 4
    punpckhwd          m5, m3
    psrad              m1, 4
    packusdw           m0, m1
    mova               m1, m10
    vpdpwssd           m1, m6, m4
    mova               m4, m10
    vpdpwssd           m4, m7, m5
    mova               m5, m12
    vpdpwssd           m5, m14, m2
    mova               m2, m12
    vpdpwssd           m2, m14, m3
    psrad              m1, 4
    psrad              m4, 4
    packusdw           m1, m4
    vpermt2b           m5, m13, m2
    vpsrlvw            m0, m11
    vpsrlvw            m1, m11
    mova          [maskq], ym5
    add             maskq, 32
    ret

; w_mask_444: full-resolution mask output (4:4:4) — the mask bytes are
; packed via vpermt2b and written as a full zmm, no sign/rounding arg.
cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea                r7, [w_mask_444_avx512icl_table]
    tzcnt              wd, wm
    mov               r6d, r8m ; pixel_max
    movifnidn          hd, hm
    shr               r6d, 11
    movsxd             wq, [r7+wq*4]
    vpbroadcastd       m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd       m9, [base+pw_64]
    vpbroadcastd      m10, [base+mask_round+r6*4]
    mova              m11, [w_mask_end444]
    vpbroadcastd      m12, [base+bidir_shift+r6*4]
    mov             maskq, maskmp
    add                wq, r7
    call .main
    lea          stride3q, [strideq*3]
    jmp                wq
.w4:
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4     xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                hd, 8
    jl .w4_end
    vextracti32x4     xm2, m0, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4     xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4     xm2, m1, 2
    lea              dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4     xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                hd, 8
    jl .w8_end
    lea              dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea              dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea              dstq, [dstq+strideq*2]
.w32:
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    sub                hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add              dstq, strideq
.w64:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    dec                hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add              dstq, strideq
.w128:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    call .main
    mova          [dstq+64*2], m0
    mova          [dstq+64*3], m1
    dec                hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; As above, but the mask words in m2/m3 are packed to bytes at full
    ; resolution (vpermt2b with w_mask_end444) and stored as 64 bytes.
    mova               m1, [tmp1q+64*0]
    mova               m3, [tmp2q+64*0]
    mova               m4, [tmp1q+64*1]
    mova               m7, [tmp2q+64*1]
    add             tmp1q, 64*2
    add             tmp2q, 64*2
    psubsw             m6, m1, m3
    punpcklwd          m5, m3, m1
    pabsw              m6, m6
    punpckhwd          m3, m1
    psubusw            m6, m8, m6
    psrlw              m6, 10
    psubw              m2, m9, m6
    punpcklwd          m1, m6, m2
    punpckhwd          m6, m2
    mova               m0, m10
    vpdpwssd           m0, m5, m1
    mova               m1, m10
    vpdpwssd           m1, m3, m6
    psubsw             m5, m4, m7
    punpcklwd          m6, m7, m4
    pabsw              m5, m5
    punpckhwd          m7, m4
    psubusw            m5, m8, m5
    psrlw              m5, 10
    psubw              m3, m9, m5
    punpcklwd          m4, m5, m3
    psrad              m0, 4
    punpckhwd          m5, m3
    psrad              m1, 4
    packusdw           m0, m1
    mova               m1, m10
    vpdpwssd           m1, m6, m4
    mova               m4, m10
    vpdpwssd           m4, m7, m5
    vpermt2b           m2, m11, m3
    psrad              m1, 4
    psrad              m4, 4
    packusdw           m1, m4
    vpsrlvw            m0, m12
    vpsrlvw            m1, m12
    mova          [maskq], m2
    add             maskq, 64
    ret

; blend: dst = dst + ((tmp - dst) * mask) via pmulhrsw; the mask is
; scaled by pw_m512 (-512) so the 15-bit rounding multiply yields the
; mask/64 blend factor. Operates in-place on dst.
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea                r6, [blend_avx512icl_table]
    tzcnt              wd, wm
    movifnidn          hd, hm
    movsxd             wq, [r6+wq*4]
    movifnidn       maskq, maskmp
    vpbroadcastd       m6, [base+pw_m512]
    add                wq, r6
    lea                r6, [dsq*3]
    jmp                wq
.w4:
    ; Uses the upper register bank (m16+), hence vzeroupper on exit
    pmovzxbw         ym19, [maskq]
    movq             xm16, [dstq+dsq*0]
    movhps           xm16, [dstq+dsq*1]
    vpbroadcastq     ym17, [dstq+dsq*2]
    vpbroadcastq     ym18, [dstq+r6   ]
    pmullw           ym19, ym6
    vpblendd         ym16, ym17, 0x30
    vpblendd         ym16, ym18, 0xc0
    psubw            ym17, ym16, [tmpq]
    add             maskq, 16
    add              tmpq, 32
    pmulhrsw         ym17, ym19
    paddw            ym16, ym17
    vextracti128     xm17, ym16, 1
    movq      [dstq+dsq*0], xm16
    movhps    [dstq+dsq*1], xm16
    movq      [dstq+dsq*2], xm17
    movhps    [dstq+r6   ], xm17
    lea              dstq, [dstq+dsq*4]
    sub                hd, 4
    jg .w4
    vzeroupper
    RET
.w8:
    pmovzxbw           m2, [maskq]
    mova              xm0, [dstq+dsq*0]
    vinserti32x4      ym0, [dstq+dsq*1], 1
    vinserti32x4       m0, [dstq+dsq*2], 2
    vinserti32x4       m0, [dstq+r6   ], 3
    pmullw             m2, m6
    psubw              m1, m0, [tmpq]
    add             maskq, 32
    add              tmpq, 64
    pmulhrsw           m1, m2
    paddw              m0, m1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6   ], m0, 3
    lea              dstq, [dstq+dsq*4]
    sub                hd, 4
    jg .w8
    RET
.w16:
    pmovzxbw           m4, [maskq+32*0]
    pmovzxbw           m5, [maskq+32*1]
    mova              ym0, [dstq+dsq*0]
    vinserti32x8       m0, [dstq+dsq*1], 1
    mova              ym1, [dstq+dsq*2]
    vinserti32x8       m1, [dstq+r6   ], 1
    pmullw             m4, m6
    pmullw             m5, m6
    psubw              m2, m0, [tmpq+64*0]
    psubw              m3, m1, [tmpq+64*1]
    add             maskq, 32*2
    add              tmpq, 64*2
    pmulhrsw           m2, m4
    pmulhrsw           m3, m5
    paddw              m0, m2
    paddw              m1, m3
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova          [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6   ], m1, 1
    lea              dstq, [dstq+dsq*4]
    sub                hd, 4
    jg .w16
    RET
.w32:
    pmovzxbw           m4, [maskq+32*0]
    pmovzxbw           m5, [maskq+32*1]
    mova               m0, [dstq+dsq*0]
    mova               m1, [dstq+dsq*1]
    pmullw             m4, m6
    pmullw             m5, m6
    psubw              m2, m0, [tmpq+ 64*0]
    psubw              m3, m1, [tmpq+ 64*1]
    add             maskq, 32*2
    add              tmpq, 64*2
    pmulhrsw           m2, m4
    pmulhrsw           m3, m5
    paddw              m0, m2
    paddw              m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w32
    RET

; blend_v: blend with a fixed per-column weight row (obmc_masks_avx2),
; broadcast/loaded once per width case.
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
    lea                r5, [blend_v_avx512icl_table]
    tzcnt              wd, wm
    movifnidn          hd, hm
    movsxd             wq, [r5+wq*4]
    add                wq, r5
    jmp                wq
.w2:
    vpbroadcastd     xmm2, [obmc_masks_avx2+2*2]
.w2_loop:
    movd             xmm0, [dstq+dsq*0]
    pinsrd           xmm0, [dstq+dsq*1], 1
    movq             xmm1, [tmpq]
    add              tmpq, 4*2
    psubw            xmm1, xmm0, xmm1
    pmulhrsw         xmm1, xmm2
    paddw            xmm0, xmm1
    movd      [dstq+dsq*0], xmm0
    pextrd    [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w2_loop
    RET
.w4:
    vpbroadcastq     xmm2, [obmc_masks_avx2+4*2]
.w4_loop:
    movq             xmm0, [dstq+dsq*0]
    movhps           xmm0, [dstq+dsq*1]
    psubw            xmm1, xmm0, [tmpq]
    add              tmpq, 8*2
    pmulhrsw         xmm1, xmm2
    paddw            xmm0, xmm1
    movq      [dstq+dsq*0], xmm0
    movhps    [dstq+dsq*1], xmm0
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4   ym2, [obmc_masks_avx2+8*2]
.w8_loop:
    mova              xm0, [dstq+dsq*0]
    vinserti32x4      ym0, [dstq+dsq*1], 1
    psubw             ym1, ym0, [tmpq]
    add              tmpq, 16*2
    pmulhrsw          ym1, ym2
    paddw             ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8    m2, [obmc_masks_avx2+16*2]
.w16_loop:
    mova              ym0, [dstq+dsq*0]
    vinserti32x8       m0, [dstq+dsq*1], 1
    psubw              m1, m0, [tmpq]
    add              tmpq, 32*2
    pmulhrsw           m1, m2
    paddw              m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w16_loop
    RET
.w32:
    mova               m4, [obmc_masks_avx2+32*2]
.w32_loop:
    mova               m0, [dstq+dsq*0]
    psubw              m2, m0, [tmpq+ 64*0]
    mova               m1, [dstq+dsq*1]
    psubw              m3, m1, [tmpq+ 64*1]
    add              tmpq, 64*2
    pmulhrsw           m2, m4
    pmulhrsw           m3, m4
    paddw              m0, m2
    paddw              m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea              dstq, [dstq+dsq*2]
    sub                hd, 2
    jg .w32_loop
    RET

; blend_h: blend with a per-row weight taken from obmc_masks_avx2; only
; the last 3/4 of the rows are blended (maskq is offset and hq counts up
; from -(h*3/4) to 0).
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
    lea                r6, [$$]
    tzcnt              wd, wm
    mov                hd, hm
    movsxd             wq, [base+blend_h_avx512icl_table+wq*4]
    lea             maskq, [base+obmc_masks_avx2+hq*2]
    lea                hd, [hq*3]
    lea                wq, [base+blend_h_avx512icl_table+wq]
    shr                hd, 2 ; h * 3/4
    lea             maskq, [maskq+hq*2]
    neg                hq
    jmp                wq
.w2:
    movd             xmm0, [dstq+dsq*0]
    pinsrd           xmm0, [dstq+dsq*1], 1
    movd             xmm2, [maskq+hq*2]
    movq             xmm1, [tmpq]
    add              tmpq, 4*2
    punpcklwd        xmm2, xmm2
    psubw            xmm1, xmm0, xmm1
    pmulhrsw         xmm1, xmm2
    paddw            xmm0, xmm1
    movd      [dstq+dsq*0], xmm0
    pextrd    [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    add                hq, 2
    jl .w2
    RET
.w4:
    mova             xmm3, [blend_shuf]
.w4_loop:
    movq             xmm0, [dstq+dsq*0]
    movhps           xmm0, [dstq+dsq*1]
    movd             xmm2, [maskq+hq*2]
    psubw            xmm1, xmm0, [tmpq]
    add              tmpq, 8*2
    pshufb           xmm2, xmm3
    pmulhrsw         xmm1, xmm2
    paddw            xmm0, xmm1
    movq      [dstq+dsq*0], xmm0
    movhps    [dstq+dsq*1], xmm0
    lea              dstq, [dstq+dsq*2]
    add                hq, 2
    jl .w4_loop
    RET
.w8:
    vbroadcasti32x4   ym3, [blend_shuf]
    shufpd            ym3, ym3, 0x0c  ; distribute the two row weights per lane
.w8_loop:
    mova              xm0, [dstq+dsq*0]
    vinserti32x4      ym0, [dstq+dsq*1], 1
    vpbroadcastd      ym2, [maskq+hq*2]
    psubw             ym1, ym0, [tmpq]
    add              tmpq, 16*2
    pshufb            ym2, ym3
    pmulhrsw          ym1, ym2
    paddw             ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea              dstq, [dstq+dsq*2]
    add                hq, 2
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x4    m3, [blend_shuf]
    shufpd             m3, m3, 0xf0
.w16_loop:
    mova              ym0, [dstq+dsq*0]
    vinserti32x8       m0, [dstq+dsq*1], 1
    vpbroadcastd       m2, [maskq+hq*2]
    psubw              m1, m0, [tmpq]
    add              tmpq, 32*2
    pshufb             m2, m3
    pmulhrsw           m1, m2
    paddw              m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    add                hq, 2
    jl .w16_loop
    RET
.w32:
    vpbroadcastw       m4, [maskq+hq*2]
    vpbroadcastw       m5, [maskq+hq*2+2]
    mova               m0, [dstq+dsq*0]
    psubw              m2, m0, [tmpq+ 64*0]
    mova               m1, [dstq+dsq*1]
    psubw              m3, m1, [tmpq+ 64*1]
    add              tmpq, 64*2
    pmulhrsw           m2, m4
    pmulhrsw           m3, m5
    paddw              m0, m2
    paddw              m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea              dstq, [dstq+dsq*2]
    add                hq, 2
    jl .w32
    RET
.w64:
    vpbroadcastw       m4, [maskq+hq*2]
    mova               m0, [dstq+64*0]
    psubw              m2, m0, [tmpq+64*0]
    mova               m1, [dstq+64*1]
    psubw              m3, m1, [tmpq+64*1]
    add              tmpq, 64*2
    pmulhrsw           m2, m4
    pmulhrsw           m3, m4
    paddw              m0, m2
    paddw              m1, m3
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    add              dstq, dsq
    inc                hq
    jl .w64
    RET
.w128:
    vpbroadcastw       m8, [maskq+hq*2]
    mova               m0, [dstq+64*0]
    psubw              m4, m0, [tmpq+64*0]
    mova               m1, [dstq+64*1]
    psubw              m5, m1, [tmpq+64*1]
    mova               m2, [dstq+64*2]
    psubw              m6, m2, [tmpq+64*2]
    mova               m3, [dstq+64*3]
    psubw              m7, m3, [tmpq+64*3]
    add              tmpq, 64*4
    REPX {pmulhrsw x, m8}, m4, m5, m6, m7
    paddw              m0, m4
    paddw              m1, m5
    paddw              m2, m6
    paddw              m3, m7
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    mova          [dstq+64*2], m2
    mova          [dstq+64*3], m3
    add              dstq, dsq
    inc                hq
    jl .w128
    RET

; resize: horizontal scaling, 16 output pixels per iteration. Per pixel it
; gathers 8 source words and a filter (two dword gathers from
; resize_filter), accumulates with vpdpwssd, rounds, packs and clamps to
; pxmax. The slow path (.jz not taken) handles positions that would read
; past the clipped [0, src_w-8] range via the resize_shuf edge tables.
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub          dword mx0m, 4<<14
    sub        dword src_wm, 8
    mov                r6, ~0
    vpbroadcastd       m5, dxm
    vpbroadcastd       m8, mx0m
    vpbroadcastd       m6, src_wm
    kmovq              k6, r6  ; all-ones mask, recopied before each gather
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA                r7, $$
%define base r7-$$
    vpbroadcastd       m3, [base+pd_16384]
    vpbroadcastd       m7, [base+pd_63]
    mova              m24, [base+resize_permA]
    mova              m25, [base+resize_permB]
    mova              m26, [base+resize_permC]
    mova              m27, [base+resize_permD]
    vbroadcasti32x4   m28, [base+resize_shufA]
    vbroadcasti32x4   m29, [base+resize_shufB]
    mova              m30, [base+resize_permE]
    vpbroadcastw     ym31, pxmaxm
    vpdpwssd           m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld              m5, 4 ; dx*16
    pslld              m6, 14
    pxor               m2, m2
.loop_y:
    xor                xd, xd
    mova               m4, m8 ; per-line working version of mx
.loop_x:
    pmaxsd             m0, m4, m2
    psrad              m9, m4, 8 ; filter offset (unmasked)
    pminsd             m0, m6 ; iclip(mx, 0, src_w-8)
    psubd              m1, m4, m0 ; pshufb offset
    psrad              m0, 14 ; clipped src_x offset
    psrad              m1, 14 ; pshufb edge_emu offset
    vptestmd           k5, m1, m1
    pand               m9, m7 ; filter offset (masked)
    ktestw             k5, k5
    jz .load
    ; Edge-emulation path: gather 8 words per pixel as qword pairs and
    ; reshuffle them through the resize_shuf edge table.
    vpbroadcastq      m14, [base+pd_0_4]
    vpermq            m10, m0, q1100
    vpermq            m11, m0, q3322
    vpermq            m20, m1, q1100
    vpermq            m21, m1, q3322
    punpckldq         m10, m10
    punpckldq         m11, m11
    punpckldq         m20, m20
    punpckldq         m21, m21
    paddd             m10, m14
    paddd             m11, m14
    paddd             m20, m14
    paddd             m21, m14
    vextracti32x8    ym12, m10, 1
    vextracti32x8    ym13, m11, 1
    vextracti32x8    ym22, m20, 1
    vextracti32x8    ym23, m21, 1
    kmovq              k1, k6
    kmovq              k2, k6
    kmovq              k3, k6
    kmovq              k4, k6
    vpgatherdq    m16{k1}, [srcq+ym10*2] ; 0 1 2 3
    vpgatherdq    m17{k2}, [srcq+ym11*2] ; 4 5 6 7
    vpgatherdq    m18{k3}, [srcq+ym12*2] ; 8 9 A B
    vpgatherdq    m19{k4}, [srcq+ym13*2] ; C D E F
    kmovq              k1, k6
    kmovq              k2, k6
    kmovq              k3, k6
    kmovq              k4, k6
    vpgatherdq     m0{k1}, [base+resize_shuf+8+ym20*2]
    vpgatherdq     m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq    m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq    m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb            m16, m0
    pshufb            m17, m1
    pshufb            m18, m14
    pshufb            m19, m15
    ; Transpose gathered rows into per-tap vectors for the dot products
    mova              m20, m24
    mova              m22, m24
    mova              m21, m25
    mova              m23, m25
    vpermi2d          m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d          m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d          m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d          m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
    mova              m15, m26
    mova              m17, m26
    mova              m16, m27
    mova              m18, m27
    vpermi2q          m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q          m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q          m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q          m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
    kmovq              k1, k6
    kmovq              k2, k6
    vpgatherdd    m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd    m13{k2}, [base+resize_filter+m9*8+4]
    pshufb            m10, m11, m28
    pshufb            m11, m11, m29
    pshufb            m12, m13, m28
    pshufb            m13, m13, m29
    jmp .filter
.load:
    ; Fast path: all positions in range, plain dword gathers from src
    kmovq              k1, k6
    kmovq              k2, k6
    kmovq              k3, k6
    kmovq              k4, k6
    vpgatherdd    m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd    m13{k2}, [base+resize_filter+m9*8+4]
    pshufb            m10, m11, m28
    pshufb            m11, m11, m29
    pshufb            m12, m13, m28
    pshufb            m13, m13, m29
    vpgatherdd    m15{k3}, [srcq+m0*2+ 0]
    vpgatherdd    m16{k4}, [srcq+m0*2+ 4]
    kmovq              k1, k6
    kmovq              k2, k6
    vpgatherdd    m17{k1}, [srcq+m0*2+ 8]
    vpgatherdd    m18{k2}, [srcq+m0*2+12]
.filter:
    mova              m14, m2
    vpdpwssd          m14, m15, m10
    vpdpwssd          m14, m16, m11
    vpdpwssd          m14, m17, m12
    vpdpwssd          m14, m18, m13
    psubd             m14, m3, m14  ; 16384 - acc (filter coeffs negated)
    psrad             m14, 15
    packusdw          m14, m14
    vpermq            m14, m30, m14
    pminsw           ym14, ym31  ; clamp to pixel_max
    mova        [dstq+xq*2], ym14
    paddd              m4, m5
    add                xd, 16
    cmp                xd, dst_wd
    jl .loop_x
    add              dstq, dst_strideq
    add              srcq, src_strideq
    dec                hd
    jg .loop_y
    RET

%endif ; ARCH_X86_64