1; Copyright © 2020, VideoLAN and dav1d authors 2; Copyright © 2020, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Overlapped-block motion compensation blending masks, grouped by block
; size (2/4/8/16/32). The first four bytes deliberately double as the
; pw_512 rounding constant, so the two labels alias the same storage.
obmc_masks:
pw_512:         times 2 dw 512
                ; 2
                db 45, 19, 64,  0
                ; 4
                db 39, 25, 50, 14, 59,  5, 64,  0
                ; 8
                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
                ; 16
                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
                ; 32
                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0

; Byte-gather patterns (vpermb sources) for the 8x8 warp filter.
warp_8x8_permA: db  4,  5,  6,  7, 16, 17, 18, 19,  5,  6,  7,  8, 17, 18, 19, 20
                db  6,  7,  8,  9, 18, 19, 20, 21,  7,  8,  9, 10, 19, 20, 21, 22
                db  8,  9, 10, 11, 20, 21, 22, 23,  9, 10, 11, 12, 21, 22, 23, 24
                db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
warp_8x8_permB: db  0,  1,  2,  3, 20, 21, 22, 23,  1,  2,  3,  4, 21, 22, 23, 24
                db  2,  3,  4,  5, 22, 23, 24, 25,  3,  4,  5,  6, 23, 24, 25, 26
                db  4,  5,  6,  7, 24, 25, 26, 27,  5,  6,  7,  8, 25, 26, 27, 28
                db  6,  7,  8,  9, 26, 27, 28, 29,  7,  8,  9, 10, 27, 28, 29, 30
; -1 bytes zero the destination lane (vpermb indices with the sign bit set).
warp_8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp_8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
pd_0to7:        dd  0,  1,  2,  3,  4,  5,  6,  7
warp_8x8_hpack: db  3, 11,  3, 11, 35, 43, 35, 43
pd_16384:       dd 16384
pd_262144:      dd 262144
warp_8x8_end:   db  0,  4, 16, 20, 32, 36, 48, 52,  2,  6, 18, 22, 34, 38, 50, 54
warp_8x8t_end:  db  2,  3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
                db  6,  7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
bidir_sctr_w4:  dd  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15

; Byte permutations used by the w_mask_420/422/444 bidirectional blends.
wm_420_perm4:   db  1,  3,  9, 11,  5,  7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
                db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
                db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
wm_420_perm8:   db  1,  3, 17, 19,  5,  7, 21, 23,  9, 11, 25, 27, 13, 15, 29, 31
                db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
                db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
wm_420_perm16:  db  1,  3, 33, 35,  5,  7, 37, 39,  9, 11, 41, 43, 13, 15, 45, 47
                db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
                db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
wm_420_mask:    db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
                db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_444_mask:    db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
                db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
                db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62

; Source-byte pairing patterns for the bilinear horizontal filter
; (each pair src[x], src[x+1] feeds one pmaddubsw lane).
bilin_h_perm16: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
                db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
                db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
bilin_h_perm32: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
                db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
                db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
; Row-interleave patterns for the bilinear vertical filter.
bilin_v_perm8:  db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
                db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
                db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
bilin_v_perm16: db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
                db  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
                db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
                db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
bilin_v_perm32: db  0, 64,  1, 65,  2, 66,  3, 67,  4, 68,  5, 69,  6, 70,  7, 71
                db  8, 72,  9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
                db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
bilin_v_perm64: dd  0,  0,  4,  8,  1,  1,  5,  9,  2,  2,  6, 10,  3,  3,  7, 11

; Sliding-window patterns for the 4-tap part of the subpel (8-tap) filters.
spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
spel_h_perm32:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
spel_v_perm8:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db  8, 16,  9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
                db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
spel_v_perm16a: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm16b: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
                db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
spel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
                db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
                db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
                db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
spel_hv_perm4d: db 18, 19,  0,  1, 22, 23,  4,  5, 26, 27,  8,  9, 30, 31, 12, 13
                db  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29
spel_hv_perm8a: db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
                db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
                db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
                db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_hv_perm8b: db 34, 35,  0,  1, 38, 39,  4,  5, 42, 43,  8,  9, 46, 47, 12, 13
                db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
                db  0,  1, 32, 33,  4,  5, 36, 37,  8,  9, 40, 41, 12, 13, 44, 45
                db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
spel_hv_perm16a:db  0,  1,  2,  3, 32, 33, 34, 35,  1,  2,  3,  4, 33, 34, 35, 36
                db  2,  3,  4,  5, 34, 35, 36, 37,  3,  4,  5,  6, 35, 36, 37, 38
                db  8,  9, 10, 11, 40, 41, 42, 43,  9, 10, 11, 12, 41, 42, 43, 44
                db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
spel_hv_perm16b:db  0,  1,  2,  3,  1,  2,  3,  4,  4,  5,  6,  7,  5,  6,  7,  8
                db  2,  3,  4,  5,  3,  4,  5,  6,  6,  7,  8,  9,  7,  8,  9, 10
                db  8,  9, 10, 11,  9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
                db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
spel_hv_end16:  db  1,  3, 17, 19,  5,  7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
                db  9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end:    db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_v_shuf4:  db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC:   dd  0,  4,  8, 12
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7

wm_420_perm64:  dq 0xfedcba9876543210
; pb_m64 and pb_64 below alias into this triple via %define.
wm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040

pb_8x0_8x8:     times 8 db 0
                times 8 db 8
pb_4:           times 4 db 4
pb_32:          times 4 db 32
pb_127:         times 4 db 127
pw_m128:        times 2 dw -128 ; fixed: label was missing its trailing colon
pw_m256:        times 2 dw -256
pw_1024:        times 2 dw 1024
pw_2048:        times 2 dw 2048
pw_6903:        times 2 dw 6903
pw_8192:        times 2 dw 8192
pd_32:          dd 32
pd_34:          dd 34
pd_63:          dd 63
pd_512:         dd 512

; Aliases into existing storage to avoid duplicating constants.
%define pb_m64 (wm_sign+4)
%define pb_64  (wm_sign+8)
%define pd_2   (pd_0to7+8)

cextern mc_subpel_filters
; -8 biases the base so that filter indices (which start at 1) address
; the table directly.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter

; BASE_JMP_TABLE name, suffix, w...
; Emits 16-bit offsets from the function entry (name_suffix) to its
; per-width labels (.w2, .w4, ...); the table symbol is pre-biased by
; the smallest width so indexing with tzcnt(w) lands on the right slot.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; HV_JMP_TABLE name, filter, suffix, types, w...
; Like BASE_JMP_TABLE, but emits up to three tables (h, v, hv) selected
; by the `types` bitmask (1 = h, 2 = v, 4 = hv). Offsets are relative to
; the bilin entry point (%%base = name_suffix).
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; BIDIR_JMP_TABLE name, suffix, w...
; 32-bit offsets relative to the table itself (pre-biased by 2*w_min).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE put,  avx512icl,    2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl,       4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  8tap,  avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap,  avx512icl, 3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg,        avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx512icl, 2, 4, 8, 16, 32, 64, 128

SECTION .text

; Run the given instruction(s) with ymm register width, then switch the
; register aliases back to zmm.
%macro WRAP_YMM 1+
INIT_YMM cpuname
    %1
INIT_ZMM cpuname
%endmacro

INIT_ZMM avx512icl
;-----------------------------------------------------------------------------
; put_bilin_8bpc(dst, dst_stride, src, src_stride, w, h, mx, my)
; Bilinear (and plain copy) motion compensation, 8 bits per component.
; mx (r6m) and my (r7m) select the horizontal/vertical subpel phase;
; both zero means a straight copy. Dispatch is via the width jump tables
; above, indexed with tzcnt(w).
;-----------------------------------------------------------------------------
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn      mxyd, r6m ; mx
    lea              r7, [put_avx512icl]
    tzcnt            wd, wm
    movifnidn        hd, hm
    test           mxyd, mxyd
    jnz .h
    mov            mxyd, r7m ; my
    test           mxyd, mxyd
    jnz .v
; mx == my == 0: plain copy, two rows per iteration (one row for w >= 64).
.put:
    movzx            wd, word [r7+wq*2+table_offset(put,)]
    add              wq, r7
    jmp              wq
.put_w2:
    movzx           r6d, word [srcq+ssq*0]
    movzx           r7d, word [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mov  [dstq+dsq*0], r6w
    mov  [dstq+dsq*1], r7w
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w2
    RET
.put_w4:
    mov             r6d, [srcq+ssq*0]
    mov             r7d, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mov  [dstq+dsq*0], r6d
    mov  [dstq+dsq*1], r7d
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w4
    RET
.put_w8:
    mov              r6, [srcq+ssq*0]
    mov              r7, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mov  [dstq+dsq*0], r6
    mov  [dstq+dsq*1], r7
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w8
    RET
.put_w16:
    movu           xmm0, [srcq+ssq*0]
    movu           xmm1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], xmm0
    mova [dstq+dsq*1], xmm1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w16
    RET
.put_w32:
    movu            ym0, [srcq+ssq*0]
    movu            ym1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], ym0
    mova [dstq+dsq*1], ym1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w32
    RET
.put_w64:
    movu             m0, [srcq+ssq*0]
    movu             m1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w64
    RET
.put_w128:
    movu             m0, [srcq+ssq*0+64*0]
    movu             m1, [srcq+ssq*0+64*1]
    movu             m2, [srcq+ssq*1+64*0]
    movu             m3, [srcq+ssq*1+64*1]
    lea            srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+64*0], m0
    mova [dstq+dsq*0+64*1], m1
    mova [dstq+dsq*1+64*0], m2
    mova [dstq+dsq*1+64*1], m3
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .put_w128
    RET
; Horizontal-only filter. m5 holds the packed byte pair (16-mx, mx) for
; pmaddubsw (mxy*255+16 = (mx<<8) | (16-mx)); m4 holds the byte-pairing
; shuffle; m3 the pmulhrsw rounding constant (>> 4 with rounding).
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    imul           mxyd, 255
    vbroadcasti128   m4, [bilin_h_perm16]
    add            mxyd, 16
    vpbroadcastw     m5, mxyd
    mov            mxyd, r7m ; my
    test           mxyd, mxyd
    jnz .hv
    movzx            wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    vpbroadcastd     m3, [pw_2048]
    add              wq, r7
    jmp              wq
.h_w2:
    movd           xmm0, [srcq+ssq*0]
    pinsrd         xmm0, [srcq+ssq*1], 1
    lea            srcq, [srcq+ssq*2]
    pshufb         xmm0, xm4
    pmaddubsw      xmm0, xm5
    pmulhrsw       xmm0, xm3
    packuswb       xmm0, xmm0
    pextrw [dstq+dsq*0], xmm0, 0
    pextrw [dstq+dsq*1], xmm0, 2
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .h_w2
    RET
.h_w4:
    mova           xmm4, [bilin_h_shuf4]
.h_w4_loop:
    movq           xmm0, [srcq+ssq*0]
    movhps         xmm0, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    pshufb         xmm0, xmm4
    pmaddubsw      xmm0, xm5
    pmulhrsw       xmm0, xm3
    packuswb       xmm0, xmm0
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    movu            xm0, [srcq+ssq*0]
    vinserti32x4    ym0, [srcq+ssq*1], 1
    lea            srcq, [srcq+ssq*2]
    pshufb          ym0, ym4
    pmaddubsw       ym0, ym5
    pmulhrsw        ym0, ym3
    vpmovuswb       xm0, ym0
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .h_w8
    RET
.h_w16:
    mova             m4, [bilin_h_perm16]
.h_w16_loop:
    movu            ym0, [srcq+ssq*0]
    vinserti32x8     m0, [srcq+ssq*1], 1
    lea            srcq, [srcq+ssq*2]
    vpermb           m0, m4, m0
    pmaddubsw        m0, m5
    pmulhrsw         m0, m3
    vpmovuswb       ym0, m0
    mova [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], ym0, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    movu            ym0, [srcq+ssq*0+8*0]
    vinserti32x8     m0, [srcq+ssq*1+8*0], 1
    movu            ym1, [srcq+ssq*0+8*1]
    vinserti32x8     m1, [srcq+ssq*1+8*1], 1
    lea            srcq, [srcq+ssq*2]
    pshufb           m0, m4
    pshufb           m1, m4
    pmaddubsw        m0, m5
    pmaddubsw        m1, m5
    pmulhrsw         m0, m3
    pmulhrsw         m1, m3
    packuswb         m0, m1
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .h_w32
    RET
.h_w64:
    movu             m0, [srcq+8*0]
    movu             m1, [srcq+8*1]
    pshufb           m0, m4
    pshufb           m1, m4
    pmaddubsw        m0, m5
    pmaddubsw        m1, m5
    pmulhrsw         m0, m3
    pmulhrsw         m1, m3
    packuswb         m0, m1
    add            srcq, ssq
    mova         [dstq], m0
    add            dstq, dsq
    dec              hd
    jg .h_w64
    RET
.h_w128:
    movu             m0, [srcq+8*0]
    movu             m2, [srcq+8*1]
    movu             m1, [srcq+8*8]
    movu             m6, [srcq+8*9]
    add            srcq, ssq
    REPX {pshufb    x, m4}, m0, m2, m1, m6
    REPX {pmaddubsw x, m5}, m0, m2, m1, m6
    REPX {pmulhrsw  x, m3}, m0, m2, m1, m6
    packuswb         m0, m2
    packuswb         m1, m6
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add            dstq, dsq
    dec              hd
    jg .h_w128
    RET
; Vertical-only filter: same (16-my, my) pmaddubsw weights in m4, rows
; interleaved bytewise so each word lane pairs src[x] with src[x+stride].
.v:
    movzx            wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    imul           mxyd, 255
    vpbroadcastd     m5, [pw_2048]
    add            mxyd, 16
    add              wq, r7
    vpbroadcastw     m4, mxyd
    jmp              wq
.v_w2:
    movd           xmm0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw         xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
    lea            srcq, [srcq+ssq*2]
    pinsrw         xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
    pshuflw        xmm1, xmm1, q2301           ; 1 0
    punpcklbw      xmm1, xmm0
    pmaddubsw      xmm1, xm4
    pmulhrsw       xmm1, xm5
    packuswb       xmm1, xmm1
    pextrw [dstq+dsq*0], xmm1, 1
    pextrw [dstq+dsq*1], xmm1, 0
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd           xmm0, [srcq+ssq*0]
.v_w4_loop:
    vpbroadcastd   xmm2, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vpblendd       xmm1, xmm2, xmm0, 0x01 ; 0 1
    vpbroadcastd   xmm0, [srcq+ssq*0]
    vpblendd       xmm2, xmm0, 0x02       ; 1 2
    punpcklbw      xmm1, xmm2
    pmaddubsw      xmm1, xm4
    pmulhrsw       xmm1, xm5
    packuswb       xmm1, xmm1
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq           xmm0, [srcq+ssq*0]
.v_w8_loop:
    movq           xmm2, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    punpcklbw      xmm1, xmm0, xmm2
    movq           xmm0, [srcq+ssq*0]
    punpcklbw      xmm2, xmm0
    pmaddubsw      xmm1, xm4
    pmaddubsw      xmm2, xm4
    pmulhrsw       xmm1, xm5
    pmulhrsw       xmm2, xm5
    packuswb       xmm1, xmm2
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w8_loop
    RET
; Legacy-ymm path (ymm0-3 rather than AVX-512 ym aliases), hence the
; vzeroupper before returning.
.v_w16:
    movu           xmm0, [srcq+ssq*0]
.v_w16_loop:
    vbroadcasti128 ymm3, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vpblendd       ymm2, ymm3, ymm0, 0x0f ; 0 1
    vbroadcasti128 ymm0, [srcq+ssq*0]
    vpblendd       ymm3, ymm0, 0xf0       ; 1 2
    punpcklbw      ymm1, ymm2, ymm3
    punpckhbw      ymm2, ymm3
    pmaddubsw      ymm1, ym4
    pmaddubsw      ymm2, ym4
    pmulhrsw       ymm1, ym5
    pmulhrsw       ymm2, ym5
    packuswb       ymm1, ymm2
    mova [dstq+dsq*0], xmm1
    vextracti128 [dstq+dsq*1], ymm1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w16_loop
    vzeroupper
    RET
.v_w32:
    movu            ym0, [srcq+ssq*0]
    kxnorb           k1, k1, k1 ; k1 = low-byte lanes set, selects 256-bit halves
.v_w32_loop:
    vbroadcasti32x8  m3, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vpblendmd   m2{k1}, m3, m0 ; 0 1
    vbroadcasti32x8  m0, [srcq+ssq*0]
    vpblendmd   m3{k1}, m0, m3 ; 1 2
    punpcklbw        m1, m2, m3
    punpckhbw        m2, m3
    pmaddubsw        m1, m4
    pmaddubsw        m2, m4
    pmulhrsw         m1, m5
    pmulhrsw         m2, m5
    packuswb         m1, m2
    mova [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu             m0, [srcq+ssq*0]
.v_w64_loop:
    movu             m3, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    punpcklbw        m1, m0, m3
    punpckhbw        m6, m0, m3
    movu             m0, [srcq+ssq*0]
    pmaddubsw        m1, m4
    pmaddubsw        m6, m4
    punpcklbw        m2, m3, m0
    punpckhbw        m3, m0
    pmaddubsw        m2, m4
    pmaddubsw        m3, m4
    REPX {pmulhrsw  x, m5}, m1, m6, m2, m3
    packuswb         m1, m6
    packuswb         m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    movu             m0, [srcq+64*0]
    movu             m1, [srcq+64*1]
.v_w128_loop:
    add            srcq, ssq
    movu             m2, [srcq+64*0]
    movu             m3, [srcq+64*1]
    punpcklbw        m6, m0, m2
    pmaddubsw        m6, m4
    punpckhbw        m0, m2
    pmaddubsw        m0, m4
    punpcklbw        m7, m1, m3
    pmaddubsw        m7, m4
    punpckhbw        m1, m3
    pmaddubsw        m1, m4
    REPX {pmulhrsw  x, m5}, m6, m0, m7, m1
    packuswb         m6, m0
    mova             m0, m2 ; carry current row into next iteration
    packuswb         m7, m1
    mova             m1, m3
    mova [dstq+64*0], m6
    mova [dstq+64*1], m7
    add            dstq, dsq
    dec              hd
    jg .v_w128_loop
    RET
; Combined h+v filter: horizontal pass produces 16-bit intermediates,
; then the vertical blend is done via pmulhw with my<<11 (signed), and
; m7 (pw_2048) performs the final round+shift.
; NOTE(review): .hv_w32/.hv_w64/.hv_w128 use xmm8-xmm15 while only
; WIN64_SPILL_XMM 8 is issued here - confirm the Win64 callee-saved xmm
; handling against x86inc's spill semantics.
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx            wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl            mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd     m7, [pw_2048]
    add              wq, r7
    vpbroadcastw     m6, mxyd
    jmp              wq
.hv_w2:
    vpbroadcastd   xmm0, [srcq+ssq*0]
    pshufb         xmm0, xm4
    pmaddubsw      xmm0, xm5
.hv_w2_loop:
    movd           xmm1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    pinsrd         xmm1, [srcq+ssq*0], 1
    pshufb         xmm1, xm4
    pmaddubsw      xmm1, xm5             ; 1 _ 2 _
    shufps         xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
    mova           xmm0, xmm1
    psubw          xmm1, xmm2
    paddw          xmm1, xmm1
    pmulhw         xmm1, xm6
    paddw          xmm1, xmm2
    pmulhrsw       xmm1, xm7
    packuswb       xmm1, xmm1
    pextrw [dstq+dsq*0], xmm1, 0
    pextrw [dstq+dsq*1], xmm1, 2
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova           xmm4, [bilin_h_shuf4]
    movddup        xmm0, [srcq+ssq*0]
    pshufb         xmm0, xmm4
    pmaddubsw      xmm0, xm5
.hv_w4_loop:
    movq           xmm1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    movhps         xmm1, [srcq+ssq*0]
    pshufb         xmm1, xmm4
    pmaddubsw      xmm1, xm5             ; 1 2
    shufps         xmm2, xmm0, xmm1, q1032 ; 0 1
    mova           xmm0, xmm1
    psubw          xmm1, xmm2
    paddw          xmm1, xmm1
    pmulhw         xmm1, xm6
    paddw          xmm1, xmm2
    pmulhrsw       xmm1, xm7
    packuswb       xmm1, xmm1
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128  ym0, [srcq+ssq*0]
    pshufb          ym0, ym4
    pmaddubsw       ym0, ym5
.hv_w8_loop:
    movu            xm1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vinserti128     ym1, [srcq+ssq*0], 1
    pshufb          ym1, ym4
    pmaddubsw       ym1, ym5 ; 1 2
    valignq         ym2, ym1, ym0, 2 ; previous row pair shifted into place
    mova            ym0, ym1
    psubw           ym1, ym2
    paddw           ym1, ym1
    pmulhw          ym1, ym6
    paddw           ym1, ym2
    pmulhrsw        ym1, ym7
    vpmovuswb       xm1, ym1
    movq [dstq+dsq*0], xm1
    movhps [dstq+dsq*1], xm1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    vbroadcasti32x8  m0, [srcq+ssq*0]
    mova             m4, [bilin_h_perm16]
    vpermb           m0, m4, m0
    pmaddubsw        m0, m5
.hv_w16_loop:
    movu            ym1, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vinserti32x8     m1, [srcq+ssq*0], 1
    vpermb           m1, m4, m1
    pmaddubsw        m1, m5 ; 1 2
    valignq          m2, m1, m0, 4 ; 0 1
    mova             m0, m1
    psubw            m1, m2
    paddw            m1, m1
    pmulhw           m1, m6
    paddw            m1, m2
    pmulhrsw         m1, m7
    vpmovuswb       ym1, m1
    mova [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    mova             m4, [bilin_h_perm32]
    vpermb           m0, m4, [srcq+ssq*0]
    pmovzxbq         m8, [pb_02461357] ; qword deinterleave after packuswb
    pmaddubsw        m0, m5
.hv_w32_loop:
    vpermb           m2, m4, [srcq+ssq*1]
    lea            srcq, [srcq+ssq*2]
    vpermb           m3, m4, [srcq+ssq*0]
    pmaddubsw        m2, m5
    psubw            m1, m2, m0
    paddw            m1, m1
    pmulhw           m1, m6
    paddw            m1, m0
    pmaddubsw        m0, m3, m5
    psubw            m3, m0, m2
    paddw            m3, m3
    pmulhw           m3, m6
    paddw            m3, m2
    pmulhrsw         m1, m7
    pmulhrsw         m3, m7
    packuswb         m1, m3
    vpermq           m1, m8, m1
    mova [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea            dstq, [dstq+dsq*2]
    sub              hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    movu             m0, [srcq+8*0]
    movu             m1, [srcq+8*1]
    pshufb           m0, m4
    pshufb           m1, m4
    pmaddubsw        m0, m5
    pmaddubsw        m1, m5
.hv_w64_loop:
    add            srcq, ssq
    movu             m2, [srcq+8*0]
    movu             m3, [srcq+8*1]
    pshufb           m2, m4
    pshufb           m3, m4
    pmaddubsw        m2, m5
    pmaddubsw        m3, m5
    psubw            m8, m2, m0
    psubw            m9, m3, m1
    paddw            m8, m8
    pmulhw           m8, m6
    paddw            m9, m9
    pmulhw           m9, m6
    paddw            m8, m0
    pmulhrsw         m8, m7
    paddw            m9, m1
    pmulhrsw         m9, m7
    mova             m0, m2 ; carry current row into next iteration
    mova             m1, m3
    packuswb         m8, m9
    mova         [dstq], m8
    add            dstq, dsq
    dec              hd
    jg .hv_w64_loop
    RET
.hv_w128:
    movu             m0, [srcq+8*0]
    movu             m1, [srcq+8*1]
    movu             m2, [srcq+8*8]
    movu             m3, [srcq+8*9]
    REPX {pshufb    x, m4}, m0, m1, m2, m3
    REPX {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
    add            srcq, ssq
    movu             m8, [srcq+8*0]
    movu             m9, [srcq+8*1]
    movu            m10, [srcq+8*8]
    movu            m11, [srcq+8*9]
    REPX {pshufb    x, m4}, m8, m9, m10, m11
    REPX {pmaddubsw x, m5}, m8, m9, m10, m11
    psubw           m12, m8, m0
    psubw           m13, m9, m1
    psubw           m14, m10, m2
    psubw           m15, m11, m3
    paddw           m12, m12
    pmulhw          m12, m6
    paddw           m13, m13
    pmulhw          m13, m6
    paddw           m14, m14
    pmulhw          m14, m6
    paddw           m15, m15
    pmulhw          m15, m6
    paddw           m12, m0
    pmulhrsw        m12, m7
    paddw           m13, m1
pmulhrsw m13, m7 863 paddw m14, m2 864 pmulhrsw m14, m7 865 paddw m15, m3 866 pmulhrsw m15, m7 867 mova m0, m8 868 mova m1, m9 869 mova m2, m10 870 mova m3, m11 871 packuswb m12, m13 872 packuswb m14, m15 873 mova [dstq+64*0], m12 874 mova [dstq+64*1], m14 875 add dstq, dsq 876 dec hd 877 jg .hv_w128_loop 878 RET 879 880DECLARE_REG_TMP 3, 5, 6 881 882cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 883 movifnidn mxyd, r5m ; mx 884 lea t2, [prep_avx512icl] 885 tzcnt wd, wm 886 movifnidn hd, hm 887 test mxyd, mxyd 888 jnz .h 889 mov mxyd, r6m ; my 890 test mxyd, mxyd 891 jnz .v 892.prep: 893 movzx wd, word [t2+wq*2+table_offset(prep,)] 894 add wq, t2 895 lea stride3q, [strideq*3] 896 jmp wq 897.prep_w4: 898 movd xmm0, [srcq+strideq*0] 899 pinsrd xmm0, [srcq+strideq*1], 1 900 pinsrd xmm0, [srcq+strideq*2], 2 901 pinsrd xmm0, [srcq+stride3q ], 3 902 lea srcq, [srcq+strideq*4] 903 pmovzxbw ym0, xmm0 904 psllw ym0, 4 905 mova [tmpq], ym0 906 add tmpq, 32 907 sub hd, 4 908 jg .prep_w4 909 RET 910.prep_w8: 911 movq xmm0, [srcq+strideq*0] 912 movq xmm1, [srcq+strideq*1] 913 vinserti128 ym0, ymm0, [srcq+strideq*2], 1 914 vinserti128 ym1, ymm1, [srcq+stride3q ], 1 915 lea srcq, [srcq+strideq*4] 916 punpcklqdq ym0, ym1 917 pmovzxbw m0, ym0 918 psllw m0, 4 919 mova [tmpq], m0 920 add tmpq, 32*2 921 sub hd, 4 922 jg .prep_w8 923 RET 924.prep_w16: 925 movu xmm0, [srcq+strideq*0] 926 vinserti128 ym0, ymm0, [srcq+strideq*1], 1 927 movu xmm1, [srcq+strideq*2] 928 vinserti128 ym1, ymm1, [srcq+stride3q ], 1 929 lea srcq, [srcq+strideq*4] 930 pmovzxbw m0, ym0 931 pmovzxbw m1, ym1 932 psllw m0, 4 933 psllw m1, 4 934 mova [tmpq+64*0], m0 935 mova [tmpq+64*1], m1 936 add tmpq, 32*4 937 sub hd, 4 938 jg .prep_w16 939 RET 940.prep_w32: 941 pmovzxbw m0, [srcq+strideq*0] 942 pmovzxbw m1, [srcq+strideq*1] 943 pmovzxbw m2, [srcq+strideq*2] 944 pmovzxbw m3, [srcq+stride3q ] 945 lea srcq, [srcq+strideq*4] 946 REPX {psllw x, 4}, m0, m1, m2, m3 947 mova [tmpq+64*0], m0 948 
mova [tmpq+64*1], m1 949 mova [tmpq+64*2], m2 950 mova [tmpq+64*3], m3 951 add tmpq, 64*4 952 sub hd, 4 953 jg .prep_w32 954 RET 955.prep_w64: 956 pmovzxbw m0, [srcq+strideq*0+32*0] 957 pmovzxbw m1, [srcq+strideq*0+32*1] 958 pmovzxbw m2, [srcq+strideq*1+32*0] 959 pmovzxbw m3, [srcq+strideq*1+32*1] 960 lea srcq, [srcq+strideq*2] 961 REPX {psllw x, 4}, m0, m1, m2, m3 962 mova [tmpq+64*0], m0 963 mova [tmpq+64*1], m1 964 mova [tmpq+64*2], m2 965 mova [tmpq+64*3], m3 966 add tmpq, 64*4 967 sub hd, 2 968 jg .prep_w64 969 RET 970.prep_w128: 971 pmovzxbw m0, [srcq+32*0] 972 pmovzxbw m1, [srcq+32*1] 973 pmovzxbw m2, [srcq+32*2] 974 pmovzxbw m3, [srcq+32*3] 975 REPX {psllw x, 4}, m0, m1, m2, m3 976 mova [tmpq+64*0], m0 977 mova [tmpq+64*1], m1 978 mova [tmpq+64*2], m2 979 mova [tmpq+64*3], m3 980 add tmpq, 64*4 981 add srcq, strideq 982 dec hd 983 jg .prep_w128 984 RET 985.h: 986 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) 987 ; = (16 - mx) * src[x] + mx * src[x + 1] 988 imul mxyd, 255 989 add mxyd, 16 990 vpbroadcastw m5, mxyd 991 mov mxyd, r6m ; my 992 test mxyd, mxyd 993 jnz .hv 994 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] 995 add wq, t2 996 lea stride3q, [strideq*3] 997 jmp wq 998.h_w4: 999 vbroadcasti32x4 ym4, [bilin_h_shuf4] 1000.h_w4_loop: 1001 movq xmm0, [srcq+strideq*0] 1002 movq xmm1, [srcq+strideq*1] 1003 vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 1004 vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 1005 lea srcq, [srcq+strideq*4] 1006 punpcklqdq ym0, ym1 1007 pshufb ym0, ym4 1008 pmaddubsw ym0, ym5 1009 mova [tmpq], ym0 1010 add tmpq, 32 1011 sub hd, 4 1012 jg .h_w4_loop 1013 RET 1014.h_w8: 1015 vbroadcasti32x4 m4, [bilin_h_perm16] 1016.h_w8_loop: 1017 movu xmm0, [srcq+strideq*0] 1018 vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 1019 vinserti32x4 m0, [srcq+strideq*2], 2 1020 vinserti32x4 m0, [srcq+stride3q ], 3 1021 lea srcq, [srcq+strideq*4] 1022 pshufb m0, m4 1023 pmaddubsw m0, m5 1024 mova [tmpq], m0 1025 add tmpq, 64 1026 sub hd, 4 1027 jg 
.h_w8_loop 1028 RET 1029.h_w16: 1030 mova m4, [bilin_h_perm16] 1031.h_w16_loop: 1032 movu ym0, [srcq+strideq*0] 1033 vinserti32x8 m0, [srcq+strideq*1], 1 1034 movu ym1, [srcq+strideq*2] 1035 vinserti32x8 m1, [srcq+stride3q ], 1 1036 lea srcq, [srcq+strideq*4] 1037 vpermb m0, m4, m0 1038 vpermb m1, m4, m1 1039 pmaddubsw m0, m5 1040 pmaddubsw m1, m5 1041 mova [tmpq+64*0], m0 1042 mova [tmpq+64*1], m1 1043 add tmpq, 64*2 1044 sub hd, 4 1045 jg .h_w16_loop 1046 RET 1047.h_w32: 1048 mova m4, [bilin_h_perm32] 1049.h_w32_loop: 1050 vpermb m0, m4, [srcq+strideq*0] 1051 vpermb m1, m4, [srcq+strideq*1] 1052 vpermb m2, m4, [srcq+strideq*2] 1053 vpermb m3, m4, [srcq+stride3q ] 1054 lea srcq, [srcq+strideq*4] 1055 pmaddubsw m0, m5 1056 pmaddubsw m1, m5 1057 pmaddubsw m2, m5 1058 pmaddubsw m3, m5 1059 mova [tmpq+64*0], m0 1060 mova [tmpq+64*1], m1 1061 mova [tmpq+64*2], m2 1062 mova [tmpq+64*3], m3 1063 add tmpq, 64*4 1064 sub hd, 4 1065 jg .h_w32_loop 1066 RET 1067.h_w64: 1068 mova m4, [bilin_h_perm32] 1069.h_w64_loop: 1070 vpermb m0, m4, [srcq+strideq*0+32*0] 1071 vpermb m1, m4, [srcq+strideq*0+32*1] 1072 vpermb m2, m4, [srcq+strideq*1+32*0] 1073 vpermb m3, m4, [srcq+strideq*1+32*1] 1074 lea srcq, [srcq+strideq*2] 1075 pmaddubsw m0, m5 1076 pmaddubsw m1, m5 1077 pmaddubsw m2, m5 1078 pmaddubsw m3, m5 1079 mova [tmpq+64*0], m0 1080 mova [tmpq+64*1], m1 1081 mova [tmpq+64*2], m2 1082 mova [tmpq+64*3], m3 1083 add tmpq, 64*4 1084 sub hd, 2 1085 jg .h_w64_loop 1086 RET 1087.h_w128: 1088 mova m4, [bilin_h_perm32] 1089.h_w128_loop: 1090 vpermb m0, m4, [srcq+32*0] 1091 vpermb m1, m4, [srcq+32*1] 1092 vpermb m2, m4, [srcq+32*2] 1093 vpermb m3, m4, [srcq+32*3] 1094 pmaddubsw m0, m5 1095 pmaddubsw m1, m5 1096 pmaddubsw m2, m5 1097 pmaddubsw m3, m5 1098 mova [tmpq+64*0], m0 1099 mova [tmpq+64*1], m1 1100 mova [tmpq+64*2], m2 1101 mova [tmpq+64*3], m3 1102 add tmpq, 64*4 1103 add srcq, strideq 1104 dec hd 1105 jg .h_w128_loop 1106 RET 1107.v: 1108 WIN64_SPILL_XMM 7 1109 movzx wd, word 
[t2+wq*2+table_offset(prep, _bilin_v)] 1110 imul mxyd, 255 1111 add mxyd, 16 1112 add wq, t2 1113 lea stride3q, [strideq*3] 1114 vpbroadcastw m6, mxyd 1115 jmp wq 1116.v_w4: 1117 vpbroadcastd xm0, [srcq+strideq*0] 1118 mov r3d, 0x29 1119 vbroadcasti32x4 ym3, [bilin_v_shuf4] 1120 kmovb k1, r3d 1121.v_w4_loop: 1122 vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ 1123 vpbroadcastd ym2, [srcq+strideq*2] 1124 vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ 1125 lea srcq, [srcq+strideq*4] 1126 vpbroadcastd ym0, [srcq+strideq*0] 1127 punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ 1128 pshufb ym2, ym3 1129 pmaddubsw ym2, ym6 1130 mova [tmpq], ym2 1131 add tmpq, 32 1132 sub hd, 4 1133 jg .v_w4_loop 1134 RET 1135.v_w8: 1136 mova m5, [bilin_v_perm8] 1137 vbroadcasti32x4 ym0, [srcq+strideq*0] 1138.v_w8_loop: 1139 vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1140 vpbroadcastq ym0, [srcq+strideq*2] 1141 vinserti32x4 m1, [srcq+stride3q ], 2 1142 lea srcq, [srcq+strideq*4] 1143 vinserti32x4 ym0, [srcq+strideq*0], 0 1144 vpermt2b m1, m5, m0 1145 pmaddubsw m1, m6 1146 mova [tmpq], m1 1147 add tmpq, 64 1148 sub hd, 4 1149 jg .v_w8_loop 1150 RET 1151.v_w16: 1152 mova m5, [bilin_v_perm16] 1153 movu xm0, [srcq+strideq*0] 1154.v_w16_loop: 1155 movu xm2, [srcq+strideq*2] 1156 vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1157 vpermt2b m1, m5, m2 1158 vinserti32x4 ym2, [srcq+stride3q ], 1 1159 lea srcq, [srcq+strideq*4] 1160 movu xm0, [srcq+strideq*0] 1161 vpermt2b m2, m5, m0 1162 pmaddubsw m1, m6 1163 pmaddubsw m2, m6 1164 mova [tmpq+64*0], m1 1165 mova [tmpq+64*1], m2 1166 add tmpq, 64*2 1167 sub hd, 4 1168 jg .v_w16_loop 1169 RET 1170.v_w32: 1171 mova m5, [bilin_v_perm32] 1172 movu ym0, [srcq+strideq*0] 1173.v_w32_loop: 1174 movu ym2, [srcq+strideq*1] 1175 movu ym3, [srcq+strideq*2] 1176 movu ym4, [srcq+stride3q ] 1177 lea srcq, [srcq+strideq*4] 1178 vpermt2b m0, m5, m2 1179 vpermt2b m2, m5, m3 1180 vpermt2b m3, m5, m4 1181 pmaddubsw m1, m0, m6 1182 movu ym0, [srcq+strideq*0] 1183 
vpermt2b m4, m5, m0 1184 pmaddubsw m2, m6 1185 pmaddubsw m3, m6 1186 pmaddubsw m4, m6 1187 mova [tmpq+64*0], m1 1188 mova [tmpq+64*1], m2 1189 mova [tmpq+64*2], m3 1190 mova [tmpq+64*3], m4 1191 add tmpq, 64*4 1192 sub hd, 4 1193 jg .v_w32_loop 1194 RET 1195.v_w64: 1196 mova m5, [bilin_v_perm64] 1197 vpermq m0, m5, [srcq+strideq*0] 1198.v_w64_loop: 1199 vpermq m1, m5, [srcq+strideq*1] 1200 lea srcq, [srcq+strideq*2] 1201 punpcklbw m4, m0, m1 1202 punpckhbw m2, m0, m1 1203 vpermq m0, m5, [srcq+strideq*0] 1204 punpcklbw m3, m1, m0 1205 punpckhbw m1, m0 1206 pmaddubsw m4, m6 1207 pmaddubsw m2, m6 1208 pmaddubsw m3, m6 1209 pmaddubsw m1, m6 1210 mova [tmpq+64*0], m4 1211 mova [tmpq+64*1], m2 1212 mova [tmpq+64*2], m3 1213 mova [tmpq+64*3], m1 1214 add tmpq, 64*4 1215 sub hd, 2 1216 jg .v_w64_loop 1217 RET 1218.v_w128: 1219 mova m5, [bilin_v_perm64] 1220 vpermq m0, m5, [srcq+strideq*0+ 0] 1221 vpermq m1, m5, [srcq+strideq*0+64] 1222.v_w128_loop: 1223 vpermq m2, m5, [srcq+strideq*1+ 0] 1224 vpermq m3, m5, [srcq+strideq*1+64] 1225 lea srcq, [srcq+strideq*2] 1226 punpcklbw m4, m0, m2 1227 punpckhbw m0, m2 1228 pmaddubsw m4, m6 1229 pmaddubsw m0, m6 1230 mova [tmpq+64*0], m4 1231 mova [tmpq+64*1], m0 1232 punpcklbw m4, m1, m3 1233 punpckhbw m1, m3 1234 pmaddubsw m4, m6 1235 pmaddubsw m1, m6 1236 mova [tmpq+64*2], m4 1237 mova [tmpq+64*3], m1 1238 vpermq m0, m5, [srcq+strideq*0+ 0] 1239 vpermq m1, m5, [srcq+strideq*0+64] 1240 punpcklbw m4, m2, m0 1241 punpckhbw m2, m0 1242 pmaddubsw m4, m6 1243 pmaddubsw m2, m6 1244 mova [tmpq+64*4], m4 1245 mova [tmpq+64*5], m2 1246 punpcklbw m4, m3, m1 1247 punpckhbw m3, m1 1248 pmaddubsw m4, m6 1249 pmaddubsw m3, m6 1250 mova [tmpq+64*6], m4 1251 mova [tmpq+64*7], m3 1252 add tmpq, 64*8 1253 sub hd, 2 1254 jg .v_w128_loop 1255 RET 1256.hv: 1257 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 1258 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) 1259 WIN64_SPILL_XMM 7 1260 movzx wd, word 
[t2+wq*2+table_offset(prep, _bilin_hv)] 1261 shl mxyd, 11 1262 vpbroadcastw m6, mxyd 1263 add wq, t2 1264 lea stride3q, [strideq*3] 1265 jmp wq 1266.hv_w4: 1267 vbroadcasti32x4 ym4, [bilin_h_shuf4] 1268 vpbroadcastq ym0, [srcq+strideq*0] 1269 pshufb ym0, ym4 1270 pmaddubsw ym0, ym5 1271.hv_w4_loop: 1272 movq xmm1, [srcq+strideq*1] 1273 movq xmm2, [srcq+strideq*2] 1274 vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 1275 lea srcq, [srcq+strideq*4] 1276 vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 1277 punpcklqdq ym1, ym2 1278 pshufb ym1, ym4 1279 pmaddubsw ym1, ym5 ; 1 2 3 4 1280 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 1281 mova ym0, ym1 1282 psubw ym1, ym2 1283 pmulhrsw ym1, ym6 1284 paddw ym1, ym2 1285 mova [tmpq], ym1 1286 add tmpq, 32 1287 sub hd, 4 1288 jg .hv_w4_loop 1289 RET 1290.hv_w8: 1291 vbroadcasti32x4 m4, [bilin_h_perm16] 1292 vbroadcasti32x4 m0, [srcq+strideq*0] 1293 pshufb m0, m4 1294 pmaddubsw m0, m5 1295.hv_w8_loop: 1296 movu xmm1, [srcq+strideq*1] 1297 vinserti128 ym1, ymm1, [srcq+strideq*2], 1 1298 vinserti128 m1, [srcq+stride3q ], 2 1299 lea srcq, [srcq+strideq*4] 1300 vinserti128 m1, [srcq+strideq*0], 3 1301 pshufb m1, m4 1302 pmaddubsw m1, m5 ; 1 2 3 4 1303 valignq m2, m1, m0, 6 ; 0 1 2 3 1304 mova m0, m1 1305 psubw m1, m2 1306 pmulhrsw m1, m6 1307 paddw m1, m2 1308 mova [tmpq], m1 1309 add tmpq, 64 1310 sub hd, 4 1311 jg .hv_w8_loop 1312 RET 1313.hv_w16: 1314 mova m4, [bilin_h_perm16] 1315 vbroadcasti32x8 m0, [srcq+strideq*0] 1316 vpermb m0, m4, m0 1317 pmaddubsw m0, m5 1318.hv_w16_loop: 1319 movu ym1, [srcq+strideq*1] 1320 vinserti32x8 m1, [srcq+strideq*2], 1 1321 movu ym2, [srcq+stride3q ] 1322 lea srcq, [srcq+strideq*4] 1323 vinserti32x8 m2, [srcq+strideq*0], 1 1324 vpermb m1, m4, m1 1325 vpermb m2, m4, m2 1326 pmaddubsw m1, m5 ; 1 2 1327 vshufi32x4 m3, m0, m1, q1032 ; 0 1 1328 pmaddubsw m0, m2, m5 ; 3 4 1329 vshufi32x4 m2, m1, m0, q1032 ; 2 3 1330 psubw m1, m3 1331 pmulhrsw m1, m6 1332 paddw m1, m3 1333 psubw m3, m0, m2 1334 pmulhrsw m3, m6 1335 
paddw m3, m2 1336 mova [tmpq+64*0], m1 1337 mova [tmpq+64*1], m3 1338 add tmpq, 64*2 1339 sub hd, 4 1340 jg .hv_w16_loop 1341 RET 1342.hv_w32: 1343 mova m4, [bilin_h_perm32] 1344 vpermb m0, m4, [srcq+strideq*0] 1345 pmaddubsw m0, m5 1346.hv_w32_loop: 1347 vpermb m1, m4, [srcq+strideq*1] 1348 lea srcq, [srcq+strideq*2] 1349 vpermb m2, m4, [srcq+strideq*0] 1350 pmaddubsw m1, m5 1351 psubw m3, m1, m0 1352 pmulhrsw m3, m6 1353 paddw m3, m0 1354 pmaddubsw m0, m2, m5 1355 psubw m2, m0, m1 1356 pmulhrsw m2, m6 1357 paddw m2, m1 1358 mova [tmpq+64*0], m3 1359 mova [tmpq+64*1], m2 1360 add tmpq, 64*2 1361 sub hd, 2 1362 jg .hv_w32_loop 1363 RET 1364.hv_w64: 1365 mova m4, [bilin_h_perm32] 1366 vpermb m0, m4, [srcq+32*0] 1367 vpermb m1, m4, [srcq+32*1] 1368 pmaddubsw m0, m5 1369 pmaddubsw m1, m5 1370.hv_w64_loop: 1371 add srcq, strideq 1372 vpermb m2, m4, [srcq+32*0] 1373 vpermb m3, m4, [srcq+32*1] 1374 pmaddubsw m2, m5 1375 pmaddubsw m3, m5 1376 psubw m7, m2, m0 1377 psubw m8, m3, m1 1378 pmulhrsw m7, m6 1379 pmulhrsw m8, m6 1380 paddw m7, m0 1381 mova m0, m2 1382 paddw m8, m1 1383 mova m1, m3 1384 mova [tmpq+64*0], m7 1385 mova [tmpq+64*1], m8 1386 add tmpq, 64*2 1387 dec hd 1388 jg .hv_w64_loop 1389 RET 1390.hv_w128: 1391 mova m4, [bilin_h_perm32] 1392 vpermb m0, m4, [srcq+32*0] 1393 vpermb m1, m4, [srcq+32*1] 1394 vpermb m2, m4, [srcq+32*2] 1395 vpermb m3, m4, [srcq+32*3] 1396 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 1397.hv_w128_loop: 1398 add srcq, strideq 1399 vpermb m7, m4, [srcq+32*0] 1400 vpermb m8, m4, [srcq+32*1] 1401 vpermb m9, m4, [srcq+32*2] 1402 vpermb m10, m4, [srcq+32*3] 1403 REPX {pmaddubsw x, m5}, m7, m8, m9, m10 1404 psubw m11, m7, m0 1405 psubw m12, m8, m1 1406 psubw m13, m9, m2 1407 psubw m14, m10, m3 1408 REPX {pmulhrsw x, m6}, m11, m12, m13, m14 1409 paddw m11, m0 1410 mova m0, m7 1411 paddw m12, m1 1412 mova m1, m8 1413 paddw m13, m2 1414 mova m2, m9 1415 paddw m14, m3 1416 mova m3, m10 1417 mova [tmpq+64*0], m11 1418 mova [tmpq+64*1], m12 1419 mova 
    [tmpq+64*2], m13 ; tail of the .hv_w128 store block (mnemonic on preceding line)
    mova [tmpq+64*3], m14
    add tmpq, 64*4
    dec hd
    jg .hv_w128_loop
    RET

; int8_t subpel_filters[5][15][8]
; Filter-type selectors: the low word indexes the row used for small
; blocks (4-tap variant), the high word the full-width filter row; the
; put_* entry points unpack these around mx/my (see the "6tap_h, mx,
; 4tap_h" annotations below).
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN fn, type, type_h, type_v[, jmp_to]
; Emits the public entry point <fn>_<type>_8bpc: loads the horizontal
; filter selector into t0d and the vertical one into t1d, then either
; falls through into the function that follows or, when a 5th argument
; is present, tail-jumps to it.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; PUT_8TAP_H dst/src, tmp1, tmp2, tmp3[, vpermb]
; 8-tap horizontal filter of one register of packed pixels via vpdpbusd
; (4-pixel dot products): m6-m8 are shuffle/permute controls selecting
; the tap windows, m5 the rounding bias, m9/m10 the coefficient pairs.
; Leaves word results in m%1; clobbers m%2-m%4. With the optional 5th
; argument nonzero, cross-lane vpermb is used instead of in-lane pshufb.
%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
%if %5
    vpermb m%2, m6, m%1
    vpermb m%3, m7, m%1
    vpermb m%4, m8, m%1
%else
%if %2 < %4 ; reuse a previous value if possible
    pshufb m%2, m%1, m6
%endif
    pshufb m%3, m%1, m7
    pshufb m%4, m%1, m8
%endif
    mova m%1, m5
    vpdpbusd m%1, m%2, m9
    mova m%2, m5
    vpdpbusd m%2, m%3, m9
    vpdpbusd m%1, m%3, m10
    vpdpbusd m%2, m%4, m10
    packusdw m%1, m%2
    psrlw m%1, 6
%endmacro

; Temp registers for the filter-type selectors set by FN; the register
; numbers differ per ABI to avoid the argument registers.
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
; the horizontal filter, 6-tap is only used for the vertical filter.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH, put_6tap_8bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_6tap_8bpc
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH, put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

;-----------------------------------------------------------------------------
; put_6tap_8bpc(dst, dst_stride, src, src_stride, w, h, mx, my)
; Subpel put for 8 bpc with a 6-tap vertical filter (taps a0..a2 held in
; m7/m8/m9 as byte pairs). The horizontal filter is shared with the 8-tap
; implementation (see .h below). Entered via the PUT_8TAP_FN stubs above,
; which preload the filter-type selectors into t0d/t1d.
; NOTE(review): argument semantics inferred from the cglobal register
; names -- confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movsxd wq, wm
    movifnidn hd, hm
    test mxd, 0xf00 ; any horizontal subpel offset?
    jnz .h
    test myd, 0xf00 ; any vertical subpel offset?
    jnz .v
.put: ; no filtering in either direction: plain width-dispatched copy
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
    lea r6, [ssq*3]
    lea r7, [dsq*3]
%if WIN64
    pop r8
%endif
    jmp wq
.v: ; vertical-only 6-tap; nsq = -src_stride to address the two rows above
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd ; use the 4-tap variant for short blocks
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    vpbroadcastd m6, [pw_512]
    lea myq, [base+subpel_filters+1+myq*8]
    vpbroadcastw m7, [myq+0] ; tap pair 0
    add r6, r8
    vpbroadcastw m8, [myq+2] ; tap pair 1
    mov nsq, ssq
    vpbroadcastw m9, [myq+4] ; tap pair 2
    neg nsq
    jmp r6
.v_w2:
    movd xmm2, [srcq+nsq*2]
    pinsrw xmm2, [srcq+nsq*1], 2
    pinsrw xmm2, [srcq+ssq*0], 4
    pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklbw xmm1, xmm2, xmm3 ; 01 12
    punpckhbw xmm2, xmm3 ; 23 34
.v_w2_loop: ; two output rows per iteration; row pairs rotate through xmm1/xmm2
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xmm3, xmm1, xm7 ; a0 b0
    mova xmm1, xmm2
    pmaddubsw xmm2, xm8 ; a1 b1
    paddw xmm3, xmm2
    vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 5 6
    punpcklbw xmm2, xmm4 ; 45 56
    pmaddubsw xmm4, xmm2, xm9 ; a2 b2
    paddw xmm3, xmm4
    pmulhrsw xmm3, xm6
    packuswb xmm3, xmm3
    pextrw [dstq+dsq*0], xmm3, 0
    pextrw [dstq+dsq*1], xmm3, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd xmm2, [srcq+nsq*2]
    pinsrd xmm2, [srcq+nsq*1], 1
    pinsrd xmm2, [srcq+ssq*0], 2
    pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklbw xmm1, xmm2, xmm3 ; 01 12
    punpckhbw xmm2, xmm3 ; 23 34
.v_w4_loop:
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xmm3, xmm1, xm7 ; a0 b0
    mova xmm1, xmm2
    pmaddubsw xmm2, xm8 ; a1 b1
    paddw xmm3, xmm2
    vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 5 6
    punpcklbw xmm2, xmm4 ; 45 56
    pmaddubsw xmm4, xmm2, xm9 ; a2 b2
    paddw xmm3, xmm4
    pmulhrsw xmm3, xm6
    packuswb xmm3, xmm3
    movd [dstq+dsq*0], xmm3
    pextrd [dstq+dsq*1], xmm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8: ; ymm regs (legacy encodings) -> needs vzeroupper before RET
    movq xmm1, [srcq+nsq*2]
    vpbroadcastq ymm3, [srcq+nsq*1]
    vpbroadcastq ymm2, [srcq+ssq*0]
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm3, 0x30
    vpblendd ymm3, ymm2, 0x30
    punpcklbw ymm1, ymm3 ; 01 12
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm0, 0x30
    punpcklbw ymm2, ymm4 ; 23 34
.v_w8_loop:
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw ymm3, ymm1, ym7 ; a0 b0
    mova ymm1, ymm2
    pmaddubsw ymm2, ym8 ; a1 b1
    paddw ymm3, ymm2
    vpblendd ymm2, ymm0, ymm4, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm4, ymm0, 0x30
    punpcklbw ymm2, ymm4 ; 45 56
    pmaddubsw ymm4, ymm2, ym9 ; a2 b2
    paddw ymm3, ymm4
    pmulhrsw ymm3, ym6
    vextracti128 xmm4, ymm3, 1
    packuswb xmm3, xmm4
    movq [dstq+dsq*0], xmm3
    movhps [dstq+dsq*1], xmm3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
.v_w16: ; k1 masks the low 128-bit lane for the vshufpd row merges
    mova m5, [spel_v_perm16a]
    vbroadcasti32x4 m1, [srcq+nsq*2]
    vbroadcasti32x4 ym3, [srcq+nsq*1]
    mov r6d, 0x0f
    vbroadcasti32x4 m2, [srcq+ssq*0]
    kmovb k1, r6d
    vbroadcasti32x4 ym4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vshufpd m1{k1}, m3, m2, 0xcc
    vshufpd m2{k1}, m4, m0, 0xcc
    vpermb m1, m5, m1 ; 01 12
    vpermb m2, m5, m2 ; 23 34
.v_w16_loop:
    vbroadcasti32x4 ym4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m3, m1, m7 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m8 ; a1 b1
    paddw m3, m2
    mova m2, m0
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vshufpd m2{k1}, m4, m0, 0xcc
    vpermb m2, m5, m2 ; 45 56
    pmaddubsw m4, m2, m9 ; a2 b2
    paddw m3, m4
    pmulhrsw m3, m6
    vextracti32x8 ym4, m3, 1
    packuswb ym3, ym4
    mova [dstq+dsq*0], xm3
    vextracti32x4 [dstq+dsq*1], ym3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32: ; m10/m11 = row-interleave permutes (m11 is m10 shifted by one row)
    mova m10, [spel_v_perm32]
    pmovzxbq m5, [pb_02461357]
    vpshrdw m11, m10, m10, 8
    movu ym0, [srcq+nsq*2]
    vinserti32x8 m0, [srcq+nsq*1], 1
    vpermb m1, m10, m0 ; 01
    vinserti32x8 m0, [srcq+ssq*0], 0
    vpermb m2, m11, m0 ; 12
    vinserti32x8 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    vpermb m3, m10, m0 ; 23
    vinserti32x8 m0, [srcq+ssq*0], 0
    vpermb m4, m11, m0 ; 34
.v_w32_loop:
    vinserti32x8 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    pmaddubsw m12, m1, m7
    mova m1, m3
    pmaddubsw m13, m2, m7
    mova m2, m4
    pmaddubsw m14, m3, m8
    vpermb m3, m10, m0 ; 45
    vinserti32x8 m0, [srcq+ssq*0], 0
    pmaddubsw m15, m4, m8
    vpermb m4, m11, m0 ; 56
    paddw m12, m14
    pmaddubsw m14, m3, m9
    paddw m13, m15
    pmaddubsw m15, m4, m9
    paddw m12, m14
    paddw m13, m15
    pmulhrsw m12, m6
    pmulhrsw m13, m6
    packuswb m12, m13
    vpermq m12, m5, m12
    mova [dstq+dsq*0], ym12
    vextracti32x8 [dstq+dsq*1], m12, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    RET
.v_w64:
.v_w128: ; processed in 64-pixel columns; r6d = ((w/64-1)<<8) | h
    lea r6d, [hq+wq*4-256]
.v_loop0:
    movu m2, [srcq+nsq*2]
    movu m4, [srcq+nsq*1]
    lea r4, [srcq+ssq*2]
    movu m11, [srcq+ssq*0]
    movu m13, [srcq+ssq*1]
    mov r7, dstq
    movu m0, [r4+ssq*0]
    punpcklbw m1, m2, m4 ; 01l
    punpckhbw m2, m4 ; 01h
    punpcklbw m3, m4, m11 ; 12l
    punpckhbw m4, m11 ; 12h
    punpcklbw m10, m11, m13 ; 23l
    punpckhbw m11, m13 ; 23h
    punpcklbw m12, m13, m0 ; 34l
    punpckhbw m13, m0 ; 34h
.v_loop:
    movu m5, [r4+ssq*1]
    pmaddubsw m14, m1, m7 ; a0l
    mova m1, m10
    pmaddubsw m10, m8 ; a1l
    lea r4, [r4+ssq*2]
    pmaddubsw m15, m2, m7 ; a0h
    mova m2, m11
    pmaddubsw m11, m8 ; a1h
    paddw m14, m10
    punpcklbw m10, m0, m5 ; 45l
    paddw m15, m11
    punpckhbw m11, m0, m5 ; 45h
    pmaddubsw m0, m10, m9 ; a2l
    paddw m14, m0
    pmaddubsw m0, m11, m9 ; a2h
    paddw m15, m0
    movu m0, [r4+ssq*0]
    pmulhrsw m14, m6
    pmulhrsw m15, m6
    packuswb m14, m15
    pmaddubsw m15, m3, m7 ; b0l
    mova m3, m12
    pmaddubsw m12, m8 ; b1l
    mova [r7+dsq*0], m14
    pmaddubsw m14, m4, m7 ; b0h
    mova m4, m13
    pmaddubsw m13, m8 ; b1h
    paddw m15, m12
    punpcklbw m12, m5, m0 ; 56l
    paddw m14, m13
    punpckhbw m13, m5, m0 ; 56h
    pmaddubsw m5, m12, m9 ; b2l
    paddw m15, m5
    pmaddubsw m5, m13, m9 ; b2h
    paddw m14, m5
    pmulhrsw m15, m6
    pmulhrsw m14, m6
    packuswb m15, m14
    mova [r7+dsq*1], m15
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_loop
    add srcq, 64
    add dstq, 64
    movzx hd, r6b ; reload row count for the next column
    sub r6d, 256
    jg .v_loop0
    RET
.h: ; horizontal-only filtering is shared with the 8-tap implementation
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
.hv: ; horizontal (vpdpbusd) + 6-tap vertical
    vpbroadcastd m9, [pd_34]
    mova xm10, [spel_hv_end]
    pxor xm0, xm0
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb ; w <= 4: use the 4-tap horizontal filter
    dec srcq
    vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq ym1, [base+subpel_filters+1+myq*8]
    mov nsq, ssq
    punpcklbw ym0, ym1
    neg nsq
    psraw ym0, 2 ; << 6
    pshufd ym11, ym0, q0000
    pshufd ym12, ym0, q1111
    pshufd ym13, ym0, q2222
    cmp wd, 4
    je .hv_w4
    vbroadcasti128 ym5, [subpel_h_shuf4]
    movq xmm0, [srcq+nsq*2]
    movhps xmm0, [srcq+nsq*1]
    movq xmm2, [srcq+ssq*0]
    movhps xmm2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm1, [srcq+ssq*0]
    vpblendd ymm0, ymm1, 0x30
    pshufb xmm2, xm5 ; 2 3
    pshufb ymm0, ym5 ; 0 1 4
    mova xmm1, xm9
    vpdpbusd xmm1, xmm2, xm7
    mova ymm2, ym9
    vpdpbusd ymm2, ymm0, ym7
    packssdw ymm2, ymm1
    psraw ymm2, 2
    vextracti128 xmm0, ymm2, 1
    vzeroupper
    palignr xmm0, xmm2, 4
    punpcklwd xmm1, xmm2, xmm0 ; 01 12
    punpckhwd xmm2, xmm0 ; 23 34
.hv_w2_loop:
    movq xmm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xmm3, [srcq+ssq*0]
    pmaddwd xmm4, xmm1, xm11 ; a0 b0
    mova xmm1, xmm2
    vpdpwssd xmm4, xmm2, xm12 ; a1 b1
    pshufb xmm3, xm5
    mova xmm2, xm9
    vpdpbusd xmm2, xmm3, xm7
    packssdw xmm3, xmm2, xmm2
    psraw xmm3, 2
    palignr xmm2, xmm3, xmm0, 12
    mova xmm0, xmm3
    punpcklwd xmm2, xmm3 ; 45 56
    vpdpwssd xmm4, xmm2, xm13 ; a2 b2
    packuswb xmm4, xmm4
    pshufb xmm4, xm10
    pextrw [dstq+dsq*0], xmm4, 0
    pextrw [dstq+dsq*1], xmm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    movq xm2, [srcq+nsq*2]
    vpbroadcastq ym1, [srcq+nsq*1]
    vinserti32x4 ym2, [srcq+ssq*0], 1
    vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m5, [subpel_h_shufA]
    vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4
    pshufb m1, m5
    mova m0, m9
    pshufb m2, m5
    mova m3, m9
    vpdpbusd m0, m1, m7
    mova ym1, [spel_hv_perm4a]
    vpdpbusd m3, m2, m7
    mova ym2, [spel_hv_perm4b]
    mov r6d, 0x5555
    mova ym6, [spel_hv_perm4d]
    packssdw m0, m3
    kmovw k1, r6d
    psraw m0, 2 ; _ 0 1 2 3 4 5 6
    vpermb ym1, ym1, ym0 ; 01 12
    vpermb m2, m2, m0 ; 23 34
.hv_w4_loop:
    movq xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    pmaddwd ym4, ym1, ym11 ; a0 b0
    mova ym1, ym2
    pshufb ym3, ym5
    mova ym0, ym9
    vpdpbusd ym0, ym3, ym7
    vpdpwssd ym4, ym2, ym12 ; a1 b1
    vpsraw ym2{k1}, ym0, 2 ; 5 6
    vpermb ym2, ym6, ym2 ; 45 56
    vpdpwssd ym4, ym2, ym13 ; a2 b2
    packuswb ym4, ym4
    vpermb ym4, ym10, ym4
    movd [dstq+dsq*0], xm4
    pextrd [dstq+dsq*1], xm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; w >= 8: full 8-tap horizontal (coefficients in m11/m12)
    shr mxd, 16
    sub srcq, 3
    vpbroadcastd m11, [base+subpel_filters+mxq*8+0]
    vpbroadcastd m12, [base+subpel_filters+mxq*8+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m1, [base+subpel_filters+1+myq*8]
    mov nsq, ssq
    punpcklbw m0, m1
    neg nsq
    psraw m0, 2 ; << 6
    pshufd m13, m0, q0000
    pshufd m14, m0, q1111
    pshufd m15, m0, q2222
    cmp wd, 8
    jne .hv_w16
    movu xm0, [srcq+nsq*2]
    vinserti32x4 ym0, [srcq+nsq*1], 1
    vbroadcasti32x4 m1, [subpel_h_shufA]
    vinserti32x4 m0, [srcq+ssq*0], 2
    vbroadcasti32x4 m4, [subpel_h_shufB]
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m7, [subpel_h_shufC]
    vbroadcasti32x4 ym5, [srcq+ssq*0]
    vbroadcasti32x8 m6, [subpel_h_shufA]
    pshufb m1, m0, m1 ; 0 1 2 3 0123
    mova m2, m9
    vpdpbusd m2, m1, m11
    pshufb m4, m0, m4 ; 0 1 2 3 4567
    mova m1, m9
    vpdpbusd m1, m4, m11
    pshufb m0, m7 ; 0 1 2 3 89ab
    pshufb ym7, ym5, ym6 ; 4 0123 4567
    mova ym3, ym9
    vpdpbusd ym3, ym7, ym11
    vbroadcasti32x8 m7, [subpel_h_shufB]
    vpdpbusd m2, m4, m12
    mova m4, [spel_hv_perm8a]
    pshufb ym5, ym7 ; 4 4567 89ab
    vpdpbusd m1, m0, m12
    vpaddd m0, m4, [pb_32] {1to16}
    vpdpbusd ym3, ym5, ym12
    mova m5, [spel_hv_perm8b]
    mov r6, 0x55555555ff00
    packssdw m2, m1
    vpmovsdw xm3, ym3
    kmovq k1, r6
    psraw m2, 2 ; 0 1 2 3
    psraw xm3, 2 ; 4
    vpermb m1, m4, m2 ; 01 12
    kshiftrq k2, k1, 16
    vpermt2b m2, m0, m3 ; 23 34
.hv_w8_loop:
    vbroadcasti32x4 ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m3{k1}, [srcq+ssq*0]
    pmaddwd m0, m1, m13 ; a0 b0
    pshufb m1, m3, m6 ; 5 6 0123 4567
    mova m4, m9
    vpdpbusd m4, m1, m11
    pshufb m3, m7 ; 5 6 4567 89ab
    vpdpwssd m0, m2, m14 ; a1 b1
    mova m1, m2
    vpdpbusd m4, m3, m12
    psraw m2{k2}, m4, 2 ; 53 64
    vpermb m2, m5, m2 ; 45 56
    vpdpwssd m0, m2, m15 ; a2 b2
    packuswb m0, m0
    vpermb m0, m10, m0
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w16: ; 16-pixel columns; r6d = ((w/16-1)<<8) | h
    movu m19, [spel_hv_perm16a]
    vpbroadcastd m7, [pb_4]
    lea r6d, [wq*2-32]
    mova m6, [spel_hv_perm16b]
    paddb m20, m7, m19
    lea r6d, [hq+r6*8]
    paddb m21, m7, m20
    mova ym10, [spel_hv_end16]
    paddb m7, m6
.hv_w16_loop0:
    movu ym16, [srcq+nsq*2]
    vinserti32x8 m16, [srcq+nsq*1], 1
    lea r4, [srcq+ssq*2]
    movu ym17, [srcq+ssq*0]
    vinserti32x8 m17, [srcq+ssq*1], 1
    mov r7, dstq
    movu ym18, [r4+ssq*0]
    vpermb m2, m19, m16 ; 0 1 0123 89ab
    mova m1, m9
    vpermb m3, m21, m16 ; 0 1 89ab ghij
    vpdpbusd m1, m2, m11
    mova m2, m9
    vpermb m4, m19, m17 ; 2 3 0123 89ab
    vpdpbusd m2, m3, m12
    mova m3, m9
    vpermb m5, m21, m17 ; 2 3 89ab ghij
    vpdpbusd m3, m4, m11
    mova m4, m9
    vpermb m0, m6, m18 ; 4 0145 2367 89cd abef
    vpdpbusd m4, m5, m12
    mova m5, m9
    vpermb m16, m20, m16 ; 0 1 4567 cdef
    vpdpbusd m5, m0, m11
    vpermb m17, m20, m17 ; 2 3 4567 cdef
    vpdpbusd m1, m16, m12
    vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij
    vpdpbusd m2, m16, m11
    vpdpbusd m3, m17, m12
    vpdpbusd m4, m17, m11
    vpdpbusd m5, m18, m12
    packssdw m1, m2 ; 01
    packssdw m3, m4 ; 23
    REPX {psraw x, 2}, m1, m3, m5
    vpshrdd m2, m1, m3, 16 ; 12
    vpshrdd m4, m3, m5, 16 ; 34
.hv_w16_loop:
    movu ym18, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti32x8 m18, [r4+ssq*0], 1
    pmaddwd m16, m1, m13 ; a0
    vpermb m1, m19, m18 ; 5 6 0123 89ab
    pmaddwd m17, m2, m13 ; b0
    vpermb m2, m20, m18 ; 5 6 4567 cdef
    mova m0, m9
    vpdpbusd m0, m1, m11
    vpermb m18, m21, m18
    mova m1, m9
    vpdpbusd m1, m2, m11
    vpdpwssd m16, m3, m14 ; a1
    vpdpwssd m17, m4, m14 ; b1
    vpdpbusd m0, m2, m12
    mova m2, m4
    vpdpbusd m1, m18, m12
    packssdw m0, m1
    mova m1, m3
    psraw m4, m0, 2 ; 5 6
    vpshrdd m3, m2, m4, 16 ; 4 5
    vpdpwssd m17, m4, m15 ; b2
    vpdpwssd m16, m3, m15 ; a2
    packuswb m16, m17
    vpermb m16, m10, m16
    mova [r7+dsq*0], xm16
    vextracti128 [r7+dsq*1], ym16, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    add srcq, 16
    add dstq, 16
    movzx hd, r6b ; reload row count for the next column
    sub r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET

PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
PUT_8TAP_FN sharp, SHARP, SHARP

; 8-tap subpel put; shares its no-filter path with put_6tap_8bpc above.
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r8,
[put_avx512icl] 2054 movsxd wq, wm 2055 movifnidn hd, hm 2056 test mxd, 0xf00 2057 jnz .h 2058 test myd, 0xf00 2059 jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put 2060.v: 2061 movzx mxd, myb 2062 shr myd, 16 2063 cmp hd, 6 2064 cmovs myd, mxd 2065 tzcnt r6d, wd 2066 lea myq, [base+subpel_filters+myq*8] 2067 movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] 2068 vpbroadcastd m7, [pw_512] 2069 vpbroadcastw m8, [myq+0] 2070 add r6, r8 2071 vpbroadcastw m9, [myq+2] 2072 lea ss3q, [ssq*3] 2073 vpbroadcastw m10, [myq+4] 2074 sub srcq, ss3q 2075 vpbroadcastw m11, [myq+6] 2076 jmp r6 2077.v_w2: 2078 movd xmm2, [srcq+ssq*0] 2079 pinsrw xmm2, [srcq+ssq*1], 2 2080 pinsrw xmm2, [srcq+ssq*2], 4 2081 add srcq, ss3q 2082 pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 2083 movd xmm3, [srcq+ssq*1] 2084 vpbroadcastd xmm1, [srcq+ssq*2] 2085 add srcq, ss3q 2086 vpbroadcastd xmm0, [srcq+ssq*0] 2087 vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 2088 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 2089 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 2090 punpcklbw xmm3, xmm1 ; 45 56 2091 punpcklbw xmm1, xmm2, xmm4 ; 01 12 2092 punpckhbw xmm2, xmm4 ; 23 34 2093.v_w2_loop: 2094 pmaddubsw xmm5, xmm1, xm8 ; a0 b0 2095 mova xmm1, xmm2 2096 pmaddubsw xmm2, xm9 ; a1 b1 2097 paddw xmm5, xmm2 2098 mova xmm2, xmm3 2099 pmaddubsw xmm3, xm10 ; a2 b2 2100 paddw xmm5, xmm3 2101 vpbroadcastd xmm4, [srcq+ssq*1] 2102 lea srcq, [srcq+ssq*2] 2103 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 2104 vpbroadcastd xmm0, [srcq+ssq*0] 2105 vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 2106 punpcklbw xmm3, xmm4 ; 67 78 2107 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 2108 paddw xmm5, xmm4 2109 pmulhrsw xmm5, xm7 2110 packuswb xmm5, xmm5 2111 pextrw [dstq+dsq*0], xmm5, 0 2112 pextrw [dstq+dsq*1], xmm5, 2 2113 lea dstq, [dstq+dsq*2] 2114 sub hd, 2 2115 jg .v_w2_loop 2116 RET 2117.v_w4: 2118 movd xmm2, [srcq+ssq*0] 2119 pinsrd xmm2, [srcq+ssq*1], 1 2120 pinsrd xmm2, [srcq+ssq*2], 2 2121 add srcq, ss3q 2122 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 2123 movd xmm3, 
[srcq+ssq*1] 2124 vpbroadcastd xmm1, [srcq+ssq*2] 2125 add srcq, ss3q 2126 vpbroadcastd xmm0, [srcq+ssq*0] 2127 vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 2128 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 2129 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 2130 punpcklbw xmm3, xmm1 ; 45 56 2131 punpcklbw xmm1, xmm2, xmm4 ; 01 12 2132 punpckhbw xmm2, xmm4 ; 23 34 2133.v_w4_loop: 2134 vpbroadcastd xmm4, [srcq+ssq*1] 2135 lea srcq, [srcq+ssq*2] 2136 pmaddubsw xmm5, xmm1, xm8 ; a0 b0 2137 mova xmm1, xmm2 2138 pmaddubsw xmm2, xm9 ; a1 b1 2139 paddw xmm5, xmm2 2140 mova xmm2, xmm3 2141 pmaddubsw xmm3, xm10 ; a2 b2 2142 paddw xmm5, xmm3 2143 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 2144 vpbroadcastd xmm0, [srcq+ssq*0] 2145 vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 2146 punpcklbw xmm3, xmm4 ; 67 78 2147 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 2148 paddw xmm5, xmm4 2149 pmulhrsw xmm5, xm7 2150 packuswb xmm5, xmm5 2151 movd [dstq+dsq*0], xmm5 2152 pextrd [dstq+dsq*1], xmm5, 1 2153 lea dstq, [dstq+dsq*2] 2154 sub hd, 2 2155 jg .v_w4_loop 2156 RET 2157.v_w8: 2158 movq xmm1, [srcq+ssq*0] 2159 vpbroadcastq ymm0, [srcq+ssq*1] 2160 vpbroadcastq ymm2, [srcq+ssq*2] 2161 add srcq, ss3q 2162 vpbroadcastq ymm5, [srcq+ssq*0] 2163 vpbroadcastq ymm3, [srcq+ssq*1] 2164 vpbroadcastq ymm4, [srcq+ssq*2] 2165 add srcq, ss3q 2166 vpblendd ymm1, ymm0, 0x30 2167 vpblendd ymm0, ymm2, 0x30 2168 punpcklbw ymm1, ymm0 ; 01 12 2169 vpbroadcastq ymm0, [srcq+ssq*0] 2170 vpblendd ymm2, ymm5, 0x30 2171 vpblendd ymm5, ymm3, 0x30 2172 punpcklbw ymm2, ymm5 ; 23 34 2173 vpblendd ymm3, ymm4, 0x30 2174 vpblendd ymm4, ymm0, 0x30 2175 punpcklbw ymm3, ymm4 ; 45 56 2176.v_w8_loop: 2177 vpbroadcastq ymm4, [srcq+ssq*1] 2178 lea srcq, [srcq+ssq*2] 2179 pmaddubsw ymm5, ymm1, ym8 ; a0 b0 2180 mova ymm1, ymm2 2181 pmaddubsw ymm2, ym9 ; a1 b1 2182 paddw ymm5, ymm2 2183 mova ymm2, ymm3 2184 pmaddubsw ymm3, ym10 ; a2 b2 2185 paddw ymm5, ymm3 2186 vpblendd ymm3, ymm0, ymm4, 0x30 2187 vpbroadcastq ymm0, [srcq+ssq*0] 2188 vpblendd ymm4, ymm4, ymm0, 0x30 2189 
punpcklbw ymm3, ymm4 ; 67 78 2190 pmaddubsw ymm4, ymm3, ym11 ; a3 b3 2191 paddw ymm5, ymm4 2192 pmulhrsw ymm5, ym7 2193 vextracti128 xmm4, ymm5, 1 2194 packuswb xmm5, xmm4 2195 movq [dstq+dsq*0], xmm5 2196 movhps [dstq+dsq*1], xmm5 2197 lea dstq, [dstq+dsq*2] 2198 sub hd, 2 2199 jg .v_w8_loop 2200 vzeroupper 2201 RET 2202.v_w16: 2203 mova m12, [spel_v_perm16a] 2204 vbroadcasti32x4 m1, [srcq+ssq*0] 2205 vbroadcasti32x4 ym4, [srcq+ssq*1] 2206 mov r6d, 0x0f 2207 vbroadcasti32x4 m2, [srcq+ssq*2] 2208 add srcq, ss3q 2209 vbroadcasti32x4 ym5, [srcq+ssq*0] 2210 kmovb k1, r6d 2211 vbroadcasti32x4 m3, [srcq+ssq*1] 2212 vbroadcasti32x4 ym6, [srcq+ssq*2] 2213 add srcq, ss3q 2214 vbroadcasti32x4 m0, [srcq+ssq*0] 2215 vshufpd m1{k1}, m4, m2, 0xcc 2216 vshufpd m2{k1}, m5, m3, 0xcc 2217 vshufpd m3{k1}, m6, m0, 0xcc 2218 vpermb m1, m12, m1 ; 01 12 2219 vpermb m2, m12, m2 ; 23 34 2220 vpermb m3, m12, m3 ; 45 56 2221.v_w16_loop: 2222 pmaddubsw m4, m1, m8 ; a0 b0 2223 mova m1, m2 2224 pmaddubsw m5, m2, m9 ; a1 b1 2225 mova m2, m3 2226 pmaddubsw m6, m3, m10 ; a2 b2 2227 mova m3, m0 2228 paddw m4, m5 2229 vbroadcasti32x4 ym5, [srcq+ssq*1] 2230 lea srcq, [srcq+ssq*2] 2231 vbroadcasti32x4 m0, [srcq+ssq*0] 2232 vshufpd m3{k1}, m5, m0, 0xcc 2233 vpermb m3, m12, m3 ; 67 78 2234 pmaddubsw m5, m3, m11 ; a3 b3 2235 paddw m4, m6 2236 paddw m4, m5 2237 pmulhrsw m4, m7 2238 vextracti32x8 ym5, m4, 1 2239 packuswb ym4, ym5 2240 mova [dstq+dsq*0], xm4 2241 vextracti32x4 [dstq+dsq*1], ym4, 1 2242 lea dstq, [dstq+dsq*2] 2243 sub hd, 2 2244 jg .v_w16_loop 2245 RET 2246.v_w32: 2247 mova m12, [spel_v_perm32] 2248 pmovzxbq m14, [pb_02461357] 2249 vpshrdw m13, m12, m12, 8 2250 movu ym0, [srcq+ssq*0] 2251 vinserti32x8 m0, [srcq+ssq*1], 1 2252 vpermb m1, m12, m0 ; 01 2253 vinserti32x8 m0, [srcq+ssq*2], 0 2254 add srcq, ss3q 2255 vpermb m2, m13, m0 ; 12 2256 vinserti32x8 m0, [srcq+ssq*0], 1 2257 vpermb m3, m12, m0 ; 23 2258 vinserti32x8 m0, [srcq+ssq*1], 0 2259 vpermb m4, m13, m0 ; 34 2260 vinserti32x8 m0, 
[srcq+ssq*2], 1 2261 add srcq, ss3q 2262 vpermb m5, m12, m0 ; 45 2263 vinserti32x8 m0, [srcq+ssq*0], 0 2264 vpermb m6, m13, m0 ; 56 2265.v_w32_loop: 2266 vinserti32x8 m0, [srcq+ssq*1], 1 2267 lea srcq, [srcq+ssq*2] 2268 pmaddubsw m15, m1, m8 2269 mova m1, m3 2270 pmaddubsw m16, m2, m8 2271 mova m2, m4 2272 pmaddubsw m17, m3, m9 2273 mova m3, m5 2274 pmaddubsw m18, m4, m9 2275 mova m4, m6 2276 pmaddubsw m19, m5, m10 2277 vpermb m5, m12, m0 ; 67 2278 vinserti32x8 m0, [srcq+ssq*0], 0 2279 pmaddubsw m20, m6, m10 2280 vpermb m6, m13, m0 ; 78 2281 paddw m15, m17 2282 pmaddubsw m17, m5, m11 2283 paddw m16, m18 2284 pmaddubsw m18, m6, m11 2285 paddw m15, m19 2286 paddw m16, m20 2287 paddw m15, m17 2288 paddw m16, m18 2289 pmulhrsw m15, m7 2290 pmulhrsw m16, m7 2291 packuswb m15, m16 2292 vpermq m15, m14, m15 2293 mova [dstq+dsq*0], ym15 2294 vextracti32x8 [dstq+dsq*1], m15, 1 2295 lea dstq, [dstq+dsq*2] 2296 sub hd, 2 2297 jg .v_w32_loop 2298 vzeroupper 2299 RET 2300.v_w64: 2301.v_w128: 2302 lea r6d, [hq+wq*4-256] 2303 mov r4, srcq 2304 mov r7, dstq 2305.v_loop0: 2306 movu m2, [srcq+ssq*0] 2307 movu m4, [srcq+ssq*1] 2308 movu m6, [srcq+ssq*2] 2309 add srcq, ss3q 2310 movu m13, [srcq+ssq*0] 2311 movu m15, [srcq+ssq*1] 2312 movu m17, [srcq+ssq*2] 2313 add srcq, ss3q 2314 movu m0, [srcq+ssq*0] 2315 punpcklbw m1, m2, m4 ; 01l 2316 punpckhbw m2, m4 ; 01h 2317 punpcklbw m3, m4, m6 ; 12l 2318 punpckhbw m4, m6 ; 12h 2319 punpcklbw m5, m6, m13 ; 23l 2320 punpckhbw m6, m13 ; 23h 2321 punpcklbw m12, m13, m15 ; 34l 2322 punpckhbw m13, m15 ; 34h 2323 punpcklbw m14, m15, m17 ; 45l 2324 punpckhbw m15, m17 ; 45h 2325 punpcklbw m16, m17, m0 ; 56l 2326 punpckhbw m17, m0 ; 56h 2327.v_loop: 2328 pmaddubsw m18, m1, m8 ; a0l 2329 mova m1, m5 2330 pmaddubsw m19, m2, m8 ; a0h 2331 mova m2, m6 2332 pmaddubsw m20, m3, m8 ; b0l 2333 mova m3, m12 2334 pmaddubsw m21, m4, m8 ; b0h 2335 mova m4, m13 2336 pmaddubsw m5, m9 ; a1l 2337 pmaddubsw m6, m9 ; a1h 2338 pmaddubsw m12, m9 ; b1l 2339 pmaddubsw m13, 
m9 ; b1h 2340 paddw m18, m5 2341 mova m5, m14 2342 pmaddubsw m14, m10 ; a2l 2343 paddw m19, m6 2344 mova m6, m15 2345 pmaddubsw m15, m10 ; a2h 2346 paddw m20, m12 2347 mova m12, m16 2348 pmaddubsw m16, m10 ; b2l 2349 paddw m21, m13 2350 mova m13, m17 2351 pmaddubsw m17, m10 ; b2h 2352 paddw m18, m14 2353 paddw m19, m15 2354 paddw m20, m16 2355 paddw m21, m17 2356 movu m17, [srcq+ssq*1] 2357 lea srcq, [srcq+ssq*2] 2358 punpcklbw m14, m0, m17 ; 67l 2359 punpckhbw m15, m0, m17 ; 67h 2360 pmaddubsw m16, m14, m11 ; a3l 2361 pmaddubsw m0, m15, m11 ; a3h 2362 paddw m18, m16 2363 paddw m19, m0 2364 movu m0, [srcq+ssq*0] 2365 punpcklbw m16, m17, m0 ; 78l 2366 punpckhbw m17, m0 ; 78h 2367 pmulhrsw m18, m7 2368 pmulhrsw m19, m7 2369 packuswb m18, m19 2370 mova [dstq+dsq*0], m18 2371 pmaddubsw m18, m16, m11 ; b3l 2372 pmaddubsw m19, m17, m11 ; b3h 2373 paddw m18, m20 2374 paddw m19, m21 2375 pmulhrsw m18, m7 2376 pmulhrsw m19, m7 2377 packuswb m18, m19 2378 mova [dstq+dsq*1], m18 2379 lea dstq, [dstq+dsq*2] 2380 sub hd, 2 2381 jg .v_loop 2382 add r4, 64 2383 add r7, 64 2384 movzx hd, r6b 2385 mov srcq, r4 2386 mov dstq, r7 2387 sub r6d, 256 2388 jg .v_loop0 2389 vzeroupper 2390 RET 2391.h: 2392 test myd, 0xf00 2393 jnz .hv 2394.h2: 2395 vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) 2396 cmp wd, 4 2397 jl .h_w2 2398 vbroadcasti128 m6, [subpel_h_shufA] 2399 je .h_w4 2400 tzcnt wd, wd 2401 vbroadcasti128 m7, [subpel_h_shufB] 2402 vbroadcasti128 m8, [subpel_h_shufC] 2403 shr mxd, 16 2404 sub srcq, 3 2405 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] 2406 vpbroadcastd m9, [base+mxq*8+subpel_filters+0] 2407 vpbroadcastd m10, [base+mxq*8+subpel_filters+4] 2408 add wq, r8 2409 jmp wq 2410.h_w2: 2411 movzx mxd, mxb 2412 dec srcq 2413 mova xmm4, [subpel_h_shuf4] 2414 vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] 2415.h_w2_loop: 2416 movq xmm0, [srcq+ssq*0] 2417 movhps xmm0, [srcq+ssq*1] 2418 lea srcq, [srcq+ssq*2] 2419 pshufb xmm0, xmm4 2420 mova xmm1, xm5 2421 vpdpbusd xmm1, xmm0, 
xmm3 2422 packssdw xmm0, xmm1, xmm1 2423 psraw xmm0, 6 2424 packuswb xmm0, xm0 2425 pextrw [dstq+dsq*0], xmm0, 0 2426 pextrw [dstq+dsq*1], xmm0, 1 2427 lea dstq, [dstq+dsq*2] 2428 sub hd, 2 2429 jg .h_w2_loop 2430 RET 2431.h_w4: 2432 movzx mxd, mxb 2433 dec srcq 2434 vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] 2435.h_w4_loop: 2436 movq xmm0, [srcq+ssq*0] 2437 movq xmm1, [srcq+ssq*1] 2438 lea srcq, [srcq+ssq*2] 2439 pshufb xmm0, xm6 2440 pshufb xmm1, xm6 2441 mova xmm2, xm5 2442 vpdpbusd xmm2, xmm0, xmm3 2443 mova xmm0, xm5 2444 vpdpbusd xmm0, xmm1, xmm3 2445 packssdw xmm0, xmm2, xmm0 2446 psraw xmm0, 6 2447 packuswb xmm0, xmm0 2448 movd [dstq+dsq*0], xmm0 2449 pextrd [dstq+dsq*1], xmm0, 1 2450 lea dstq, [dstq+dsq*2] 2451 sub hd, 2 2452 jg .h_w4_loop 2453 RET 2454.h_w8: 2455 movu xm0, [srcq+ssq*0] 2456 vinserti32x4 ym0, [srcq+ssq*1], 1 2457 lea srcq, [srcq+ssq*2] 2458 WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 2459 vpmovuswb xm0, ym0 2460 movq [dstq+dsq*0], xm0 2461 movhps [dstq+dsq*1], xm0 2462 lea dstq, [dstq+dsq*2] 2463 sub hd, 2 2464 jg .h_w8 2465 RET 2466.h_w16: 2467 mova m6, [spel_h_perm16] 2468 vpbroadcastd m8, [pb_4] 2469 paddb m7, m8, m6 2470 paddb m8, m7 2471.h_w16_loop: 2472 movu ym0, [srcq+ssq*0] 2473 vinserti32x8 m0, [srcq+ssq*1], 1 2474 lea srcq, [srcq+ssq*2] 2475 PUT_8TAP_H 0, 1, 2, 3, 1 2476 vpmovuswb ym0, m0 2477 mova [dstq+dsq*0], xm0 2478 vextracti128 [dstq+dsq*1], ym0, 1 2479 lea dstq, [dstq+dsq*2] 2480 sub hd, 2 2481 jg .h_w16_loop 2482 RET 2483.h_w32: 2484 movu ym0, [srcq+ssq*0+8*0] 2485 vinserti32x8 m0, [srcq+ssq*1+8*0], 1 2486 movu ym1, [srcq+ssq*0+8*1] 2487 vinserti32x8 m1, [srcq+ssq*1+8*1], 1 2488 lea srcq, [srcq+ssq*2] 2489 PUT_8TAP_H 0, 2, 3, 4 2490 PUT_8TAP_H 1, 4, 3, 2 2491 packuswb m0, m1 2492 mova [dstq+dsq*0], ym0 2493 vextracti32x8 [dstq+dsq*1], m0, 1 2494 lea dstq, [dstq+dsq*2] 2495 sub hd, 2 2496 jg .h_w32 2497 RET 2498.h_w64: 2499 movu m0, [srcq+8*0] 2500 movu m1, [srcq+8*1] 2501 add srcq, ssq 2502 PUT_8TAP_H 0, 2, 3, 4 2503 PUT_8TAP_H 
1, 4, 3, 2 2504 packuswb m0, m1 2505 mova [dstq], m0 2506 add dstq, dsq 2507 dec hd 2508 jg .h_w64 2509 RET 2510.h_w128: 2511 movu m0, [srcq+8*0] 2512 movu m2, [srcq+8*1] 2513 movu m1, [srcq+8*8] 2514 movu m3, [srcq+8*9] 2515 add srcq, ssq 2516 PUT_8TAP_H 0, 4, 11, 12 2517 PUT_8TAP_H 2, 12, 11, 4 2518 PUT_8TAP_H 1, 4, 11, 12 2519 PUT_8TAP_H 3, 12, 11, 4 2520 packuswb m0, m2 2521 packuswb m1, m3 2522 mova [dstq+64*0], m0 2523 mova [dstq+64*1], m1 2524 add dstq, dsq 2525 dec hd 2526 jg .h_w128 2527 RET 2528.hv: 2529 vpbroadcastd m9, [pd_34] 2530 pxor xm0, xm0 2531 cmp wd, 4 2532 jg .hv_w8 2533 movzx mxd, mxb 2534 dec srcq 2535 vpbroadcastd m7, [base+subpel_filters+mxq*8+2] 2536 movzx mxd, myb 2537 shr myd, 16 2538 cmp hd, 6 2539 cmovs myd, mxd 2540 vpbroadcastq ym1, [base+subpel_filters+myq*8] 2541 lea ss3q, [ssq*3] 2542 mov r6, srcq 2543 punpcklbw ym0, ym1 2544 sub r6, ss3q 2545 psraw ym0, 2 ; << 6 2546 mova xm14, [spel_hv_end] 2547 pshufd ym10, ym0, q0000 2548 pshufd ym11, ym0, q1111 2549 pshufd ym12, ym0, q2222 2550 pshufd ym13, ym0, q3333 2551 cmp wd, 4 2552 je .hv_w4 2553 vbroadcasti128 ym6, [subpel_h_shuf4] 2554 movq xmm2, [r6+ssq*0] 2555 movhps xmm2, [r6+ssq*1] 2556 movq xmm0, [r6+ssq*2] 2557 movhps xmm0, [srcq+ssq*0] 2558 vpbroadcastq ymm3, [srcq+ssq*1] 2559 vpbroadcastq ymm4, [srcq+ssq*2] 2560 add srcq, ss3q 2561 vpbroadcastq ymm1, [srcq+ssq*0] 2562 vpblendd ymm2, ymm3, 0x30 2563 vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ 2564 vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 2565 pshufb ymm2, ym6 2566 pshufb ymm0, ym6 2567 mova ymm1, ym9 2568 vpdpbusd ymm1, ymm2, ym7 2569 mova ymm2, ym9 2570 vpdpbusd ymm2, ymm0, ym7 2571 packssdw ymm2, ymm1, ymm2 2572 psraw ymm2, 2 2573 vextracti128 xmm3, ymm2, 1 2574 palignr xmm4, xmm3, xmm2, 4 2575 punpcklwd xmm1, xmm2, xmm4 ; 01 12 2576 punpckhwd xmm2, xmm4 ; 23 34 2577 pshufd xmm0, xmm3, q2121 2578 punpcklwd xmm3, xmm0 ; 45 56 2579.hv_w2_loop: 2580 movq xmm4, [srcq+ssq*1] 2581 lea srcq, [srcq+ssq*2] 2582 movhps xmm4, [srcq+ssq*0] 2583 
pmaddwd xmm5, xmm1, xm10 ; a0 b0 2584 mova xmm1, xmm2 2585 vpdpwssd xmm5, xmm2, xm11 ; a1 b1 2586 pshufb xmm4, xm6 2587 mova xmm2, xmm3 2588 vpdpwssd xmm5, xmm3, xm12 ; a2 b2 2589 mova xmm3, xm9 2590 vpdpbusd xmm3, xmm4, xm7 2591 packssdw xmm4, xmm3, xmm3 2592 psraw xmm4, 2 2593 palignr xmm3, xmm4, xmm0, 12 2594 mova xmm0, xmm4 2595 punpcklwd xmm3, xmm4 ; 67 78 2596 vpdpwssd xmm5, xmm3, xm13 ; a3 b3 2597 packuswb xmm5, xmm5 2598 pshufb xmm5, xm14 2599 pextrw [dstq+dsq*0], xmm5, 0 2600 pextrw [dstq+dsq*1], xmm5, 1 2601 lea dstq, [dstq+dsq*2] 2602 sub hd, 2 2603 jg .hv_w2_loop 2604 vzeroupper 2605 RET 2606.hv_w4: 2607 movq xmm1, [r6+ssq*0] 2608 vpbroadcastq ym2, [r6+ssq*1] 2609 vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 2610 vinserti32x4 m2, [srcq+ssq*0], 2 2611 vinserti32x4 m1, [srcq+ssq*1], 2 2612 vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 2613 vbroadcasti32x4 m6, [subpel_h_shufA] 2614 add srcq, ss3q 2615 vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 2616 pshufb m2, m6 2617 pshufb m1, m6 2618 mova m0, m9 2619 vpdpbusd m0, m2, m7 2620 mova m4, m9 2621 vpdpbusd m4, m1, m7 2622 mova ym1, [spel_hv_perm4a] 2623 mova ym2, [spel_hv_perm4b] 2624 mova ym3, [spel_hv_perm4c] 2625 packssdw m0, m4 2626 psraw m0, 2 ; _ 0 1 2 3 4 5 6 2627 mov r6d, 0x5555 2628 vpermb ym1, ym1, ym0 ; 01 12 2629 vpermb m2, m2, m0 ; 23 34 2630 vpermb m3, m3, m0 ; 45 56 2631 kmovw k1, r6d 2632 mova ym15, [spel_hv_perm4d] 2633.hv_w4_loop: 2634 movq xmm4, [srcq+ssq*1] 2635 lea srcq, [srcq+ssq*2] 2636 vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 2637 pmaddwd ym5, ym1, ym10 ; a0 b0 2638 mova ym1, ym2 2639 pshufb ym4, ym6 2640 mova ym0, ym9 2641 vpdpbusd ym0, ym4, ym7 2642 vpdpwssd ym5, ym2, ym11 ; a1 b1 2643 mova ym2, ym3 2644 vpdpwssd ym5, ym3, ym12 ; a2 b2 2645 vpsraw ym3{k1}, ym0, 2 ; 7 8 2646 vpermb ym3, ym15, ym3 ; 67 78 2647 vpdpwssd ym5, ym3, ym13 ; a3 b3 2648 packuswb ym5, ym5 2649 vpermb ym5, ym14, ym5 2650 movd [dstq+dsq*0], xm5 2651 pextrd [dstq+dsq*1], xm5, 1 2652 lea dstq, [dstq+dsq*2] 2653 sub hd, 2 2654 
jg .hv_w4_loop 2655 RET 2656.hv_w8: 2657 shr mxd, 16 2658 sub srcq, 3 2659 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] 2660 vpbroadcastd m11, [base+subpel_filters+mxq*8+4] 2661 movzx mxd, myb 2662 shr myd, 16 2663 cmp hd, 6 2664 cmovs myd, mxd 2665 vpbroadcastq m1, [base+subpel_filters+myq*8] 2666 punpcklbw m0, m1 2667 lea ss3q, [ssq*3] 2668 psraw m0, 2 ; << 6 2669 pshufd m12, m0, q0000 2670 pshufd m13, m0, q1111 2671 pshufd m14, m0, q2222 2672 pshufd m15, m0, q3333 2673 cmp wd, 8 2674 jne .hv_w16 2675 mov r6, srcq 2676 sub r6, ss3q 2677 movu xmm1, [r6+ssq*0] 2678 vinserti128 ymm1, [r6+ssq*1], 1 2679 movu xmm2, [srcq+ssq*1] 2680 vinserti32x4 m6, zmm1, [r6+ssq*2], 2 2681 vinserti128 ymm2, [srcq+ssq*2], 1 2682 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 2683 add srcq, ss3q 2684 vbroadcasti32x4 m4, [subpel_h_shufA] 2685 vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ 2686 vbroadcasti32x4 m7, [subpel_h_shufB] 2687 vbroadcasti32x4 m8, [subpel_h_shufC] 2688 pshufb m1, m6, m4 ; 0 1 2 3 0123 2689 mova m2, m9 2690 vpdpbusd m2, m1, m10 2691 pshufb m5, m6, m7 ; 0 1 2 3 4567 2692 mova m1, m9 2693 vpdpbusd m1, m5, m10 2694 pshufb m4, m0, m4 ; 4 5 6 _ 0123 2695 mova m3, m9 2696 vpdpbusd m3, m4, m10 2697 pshufb m7, m0, m7 ; 4 5 6 _ 4567 2698 mova m4, m9 2699 vpdpbusd m4, m7, m10 2700 pshufb m6, m8 2701 vpdpbusd m2, m5, m11 2702 vpdpbusd m1, m6, m11 2703 pshufb m6, m0, m8 2704 vpdpbusd m3, m7, m11 2705 vpdpbusd m4, m6, m11 2706 mova m5, [spel_hv_perm8a] 2707 vpaddd m0, m5, [pb_32] {1to16} 2708 mov r6, 0x55555555ff00 2709 packssdw m2, m1 2710 packssdw m3, m4 2711 mova m8, [spel_hv_perm8b] 2712 psraw m2, 2 ; 0 1 2 3 2713 psraw m3, 2 ; 4 5 6 _ 2714 vpermb m1, m5, m2 ; 01 12 2715 vbroadcasti32x8 m6, [subpel_h_shufA] 2716 kmovq k1, r6 2717 vpermt2b m2, m0, m3 ; 23 34 2718 vbroadcasti32x8 m7, [subpel_h_shufB] 2719 kshiftrq k2, k1, 16 2720 mova xm16, [spel_hv_end] 2721 vpermb m3, m5, m3 ; 45 56 2722.hv_w8_loop: 2723 vbroadcasti32x4 ym4, [srcq+ssq*1] 2724 lea srcq, [srcq+ssq*2] 2725 
vbroadcasti32x4 m4{k1}, [srcq+ssq*0] 2726 pmaddwd m0, m1, m12 ; a0 b0 2727 pshufb m1, m4, m6 ; 7 8 0123 4567 2728 mova m5, m9 2729 vpdpbusd m5, m1, m10 2730 pshufb m4, m7 ; 7 8 4567 89ab 2731 vpdpwssd m0, m2, m13 ; a1 b1 2732 mova m1, m2 2733 vpdpbusd m5, m4, m11 2734 mova m2, m3 2735 vpdpwssd m0, m3, m14 ; a2 b2 2736 psraw m3{k2}, m5, 2 ; 75 86 2737 vpermb m3, m8, m3 ; 67 78 2738 vpdpwssd m0, m3, m15 ; a3 b3 2739 packuswb m0, m0 2740 vpermb zmm1, m16, m0 2741 movq [dstq+dsq*0], xmm1 2742 movhps [dstq+dsq*1], xmm1 2743 lea dstq, [dstq+dsq*2] 2744 sub hd, 2 2745 jg .hv_w8_loop 2746 vzeroupper 2747 RET 2748.hv_w16: 2749 WIN64_SPILL_XMM 23 2750 movu m22, [spel_hv_perm16a] 2751 sub srcq, ss3q 2752 vpbroadcastd m8, [pb_4] 2753 lea r6d, [wq*2-32] 2754 mova m7, [spel_hv_perm16b] 2755 paddb m20, m8, m22 2756 mova ym16, [spel_hv_end16] 2757 paddb m21, m8, m20 2758 lea r6d, [hq+r6*8] 2759 paddb m8, m7 2760.hv_w16_loop0: 2761 movu ym17, [srcq+ssq*0] 2762 vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 2763 lea r4, [srcq+ss3q] 2764 movu ym18, [srcq+ssq*2] 2765 vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 2766 mov r7, dstq 2767 movu ym19, [r4 +ssq*1] 2768 vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 2769 add r4, ss3q 2770 vpermb m2, m22, m17 ; 0 1 0123 89ab 2771 mova m1, m9 2772 vpermb m3, m21, m17 ; 0 1 89ab ghij 2773 vpdpbusd m1, m2, m10 2774 mova m2, m9 2775 vpermb m4, m22, m18 ; 2 3 0123 89ab 2776 vpdpbusd m2, m3, m11 2777 mova m3, m9 2778 vpermb m5, m21, m18 ; 2 3 89ab ghij 2779 vpdpbusd m3, m4, m10 2780 mova m4, m9 2781 vpermb m6, m22, m19 ; 4 5 0123 89ab 2782 vpdpbusd m4, m5, m11 2783 mova m5, m9 2784 vpermb m17, m20, m17 ; 0 1 4567 cdef 2785 vpdpbusd m5, m6, m10 2786 mova m6, m9 2787 vpermb m0, m21, m19 ; 4 5 89ab ghij 2788 vpdpbusd m1, m17, m11 2789 vpdpbusd m2, m17, m10 2790 movu ym17, [r4+ssq*0] ; 6 2791 vpermb m18, m20, m18 ; 2 3 4567 cdef 2792 vpdpbusd m6, m0, m11 2793 vpermb m0, m7, m17 ; 6 0145 2367 89cd abef 2794 vpdpbusd m3, m18, m11 2795 vpermb m19, m20, m19 ; 4 5 4567 cdef 2796 
vpdpbusd m4, m18, m10 2797 mova m18, m9 2798 vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij 2799 vpdpbusd m18, m0, m10 2800 packssdw m1, m2 2801 vpdpbusd m5, m19, m11 2802 vpdpbusd m6, m19, m10 2803 packssdw m3, m4 2804 vpdpbusd m18, m17, m11 2805 psraw m1, 2 ; 01 2806 psraw m3, 2 ; 23 2807 packssdw m5, m6 2808 vpshrdd m2, m1, m3, 16 ; 12 2809 psraw m5, 2 ; 45 2810 vpshrdd m4, m3, m5, 16 ; 34 2811 psraw m18, 2 2812 vpshrdd m6, m5, m18, 16 ; 56 2813.hv_w16_loop: 2814 movu ym19, [r4+ssq*1] 2815 lea r4, [r4+ssq*2] 2816 vinserti32x8 m19, [r4+ssq*0], 1 2817 pmaddwd m17, m1, m12 ; a0 2818 vpermb m1, m22, m19 ; 7 8 0123 89ab 2819 pmaddwd m18, m2, m12 ; b0 2820 mova m0, m9 2821 vpermb m2, m21, m19 ; 7 8 89ab ghij 2822 vpdpbusd m0, m1, m10 2823 mova m1, m9 2824 vpermb m19, m20, m19 ; 7 8 4567 cdef 2825 vpdpbusd m1, m2, m11 2826 mova m2, m4 2827 vpdpwssd m17, m3, m13 ; a1 2828 vpdpwssd m18, m4, m13 ; b1 2829 mova m4, m6 2830 vpdpbusd m0, m19, m11 2831 vpdpbusd m1, m19, m10 2832 vpdpwssd m17, m5, m14 ; a2 2833 vpdpwssd m18, m6, m14 ; b2 2834 packssdw m0, m1 2835 mova m1, m3 2836 psraw m6, m0, 2 ; 78 2837 mova m3, m5 2838 vpshrdd m5, m4, m6, 16 ; 67 2839 vpdpwssd m18, m6, m15 ; b3 2840 vpdpwssd m17, m5, m15 ; a3 2841 packuswb m17, m18 2842 vpermb m17, m16, m17 2843 mova [r7+dsq*0], xm17 2844 vextracti128 [r7+dsq*1], ym17, 1 2845 lea r7, [r7+dsq*2] 2846 sub hd, 2 2847 jg .hv_w16_loop 2848 add srcq, 16 2849 add dstq, 16 2850 movzx hd, r6b 2851 sub r6d, 1<<8 2852 jg .hv_w16_loop0 2853 RET 2854 2855%if WIN64 2856DECLARE_REG_TMP 6, 4 2857%else 2858DECLARE_REG_TMP 6, 7 2859%endif 2860 2861%define PREP_8TAP_FN FN prep_8tap, 2862PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc 2863PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc 2864PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc 2865PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc 2866PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc 2867PREP_8TAP_FN regular, REGULAR, REGULAR 2868 2869cglobal 
prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3 2870%define base r7-prep_avx512icl 2871 imul mxd, mxm, 0x010101 2872 add mxd, t0d ; 6tap_h, mx, 4tap_h 2873 imul myd, mym, 0x010101 2874 add myd, t1d ; 6tap_v, my, 4tap_v 2875 lea r7, [prep_avx512icl] 2876 movifnidn hd, hm 2877 test mxd, 0xf00 2878 jnz .h 2879 test myd, 0xf00 2880 jnz .v 2881.prep: 2882 tzcnt wd, wd 2883 movzx wd, word [r7+wq*2+table_offset(prep,)] 2884 add wq, r7 2885 lea r6, [ssq*3] 2886%if WIN64 2887 pop r7 2888%endif 2889 jmp wq 2890.v: 2891 movzx mxd, myb 2892 shr myd, 16 2893 cmp hd, 4 2894 cmove myd, mxd 2895 tzcnt r5d, wd 2896 lea myq, [base+subpel_filters+1+myq*8] 2897 movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] 2898 vpbroadcastd m7, [pw_8192] 2899 sub srcq, ssq 2900 vpbroadcastw m8, [myq+0] 2901 add r5, r7 2902 vpbroadcastw m9, [myq+2] 2903 lea ss3q, [ssq*3] 2904 vpbroadcastw m10, [myq+4] 2905 sub srcq, ssq 2906 jmp r5 2907.v_w4: 2908 movd xmm2, [srcq+ssq*0] 2909 pinsrd xmm2, [srcq+ssq*1], 1 2910 vpbroadcastd ymm1, [srcq+ssq*2] 2911 add srcq, ss3q 2912 vpbroadcastd ymm3, [srcq+ssq*0] 2913 vpbroadcastd ymm0, [srcq+ssq*1] 2914 vbroadcasti128 ymm5, [deint_shuf4] 2915 vpblendd ymm1, ymm2, 0xeb 2916 punpcklqdq ymm3, ymm0 2917 vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _ 2918 pshufb ymm1, ymm5 ; 01 12 23 34 2919.v_w4_loop: 2920 pinsrd xmm0, [srcq+ssq*2], 1 2921 vpbroadcastd ymm2, [srcq+ss3q ] 2922 lea srcq, [srcq+ssq*4] 2923 vpbroadcastd ymm3, [srcq+ssq*0] 2924 vpblendd ymm2, ymm0, 0xeb 2925 vpbroadcastd ymm0, [srcq+ssq*1] 2926 punpcklqdq ymm3, ymm0 2927 vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _ 2928 pshufb ymm2, ymm5 ; 45 56 67 78 2929 pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0 2930 vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56 2931 pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2 2932 pmaddubsw ymm1, ym9 ; a1 b1 c1 d1 2933 paddw ymm3, ymm4 2934 paddw ymm3, ymm1 2935 pmulhrsw ymm3, ym7 2936 mova ymm1, ymm2 2937 mova [tmpq], ymm3 2938 add tmpq, 32 2939 sub hd, 4 2940 jg .v_w4_loop 2941 
vzeroupper 2942 RET 2943.v_w8: 2944 mova m6, [spel_v_perm8] 2945 movq xm1, [srcq+ssq*0] 2946 mov r6d, 0x3e 2947 movq xm2, [srcq+ssq*1] 2948 kmovb k1, r6d 2949 vpbroadcastq ym3, [srcq+ssq*2] 2950 add srcq, ss3q 2951 vpunpcklqdq ym2, [srcq+ssq*0] {1to4} 2952 vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} 2953 movq xm0, [srcq+ssq*1] 2954 kshiftlb k2, k1, 2 2955 shufpd m1, m2, 0x18 ; 0 1 2 3 4 2956 vpermb m1, m6, m1 ; 01 12 23 34 2957.v_w8_loop: 2958 vpbroadcastq ym3, [srcq+ss3q ] 2959 vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} 2960 lea srcq, [srcq+ssq*4] 2961 vpbroadcastq m3, [srcq+ssq*1] 2962 vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} 2963 pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 2964 vpermb m2, m6, m0 ; 45 56 67 78 2965 mova xm0, xm3 2966 vshufi32x4 m1, m2, q1032 ; 23 34 45 56 2967 pmaddubsw m3, m2, m10 ; a3 b3 c3 d3 2968 pmaddubsw m5, m1, m9 ; a2 b2 c2 d2 2969 mova m1, m2 2970 paddw m4, m3 2971 paddw m4, m5 2972 pmulhrsw m4, m7 2973 mova [tmpq], m4 2974 add tmpq, 64 2975 sub hd, 4 2976 jg .v_w8_loop 2977 RET 2978.v_w16: 2979 mova m11, [spel_v_perm16b] 2980 vbroadcasti32x4 m1, [srcq+ssq*0] 2981 mov r6d, 0x0f 2982 vbroadcasti32x4 ym3, [srcq+ssq*1] 2983 vbroadcasti32x4 m2, [srcq+ssq*2] 2984 kmovb k1, r6d 2985 add srcq, ss3q 2986 vbroadcasti32x4 ym4, [srcq+ssq*0] 2987 vbroadcasti32x4 m0, [srcq+ssq*1] 2988 vshufpd m1{k1}, m3, m2, 0xcc 2989 vshufpd m2{k1}, m4, m0, 0xcc 2990 vpermb m1, m11, m1 ; 01 12 2991 vpermb m2, m11, m2 ; 23 34 2992.v_w16_loop: 2993 pmaddubsw m3, m1, m8 ; a0 b0 2994 pmaddubsw m5, m2, m9 ; a1 b1 2995 vbroadcasti32x4 ym6, [srcq+ssq*2] 2996 pmaddubsw m4, m2, m8 ; c0 d0 2997 vbroadcasti32x4 m2, [srcq+ss3q ] 2998 lea srcq, [srcq+ssq*4] 2999 vshufpd m0{k1}, m6, m2, 0xcc 3000 vbroadcasti32x4 ym6, [srcq+ssq*0] 3001 vpermb m1, m11, m0 ; 45 56 3002 vbroadcasti32x4 m0, [srcq+ssq*1] 3003 vshufpd m2{k1}, m6, m0, 0xcc 3004 pmaddubsw m6, m1, m9 ; c1 d1 3005 vpermb m2, m11, m2 ; 67 78 3006 paddw m3, m5 3007 pmaddubsw m5, m1, m10 ; a2 b2 3008 paddw m4, m6 3009 pmaddubsw 
m6, m2, m10 ; c2 d2 3010 paddw m3, m5 3011 paddw m4, m6 3012 pmulhrsw m3, m7 3013 pmulhrsw m4, m7 3014 mova [tmpq+ 0], m3 3015 mova [tmpq+64], m4 3016 add tmpq, 64*2 3017 sub hd, 4 3018 jg .v_w16_loop 3019 RET 3020.v_w32: 3021 movshdup m6, [bilin_v_perm64] 3022 movu ym16, [srcq+ssq*0] 3023 movu ym17, [srcq+ssq*1] 3024 movu ym18, [srcq+ssq*2] 3025 add srcq, ss3q 3026 movu ym19, [srcq+ssq*0] 3027 add srcq, ssq 3028 movu ym20, [srcq+ssq*0] 3029 vpermt2q m16, m6, m18 ; 0 2 3030 vpermt2q m17, m6, m19 ; 1 3 3031 vpermt2q m18, m6, m20 ; 2 4 3032 punpcklbw m0, m16, m17 ; 01 3033 punpcklbw m1, m17, m18 ; 12 3034 punpckhbw m2, m16, m17 ; 23 3035 punpckhbw m3, m17, m18 ; 34 3036.v_w32_loop: 3037 movu ym16, [srcq+ssq*1] 3038 lea srcq, [srcq+ssq*2] 3039 movu ym17, [srcq+ssq*0] 3040 pmaddubsw m4, m0, m8 ; a0 3041 mova m0, m2 3042 pmaddubsw m2, m9 ; a1 3043 vpermt2q m16, m6, m17 ; 5 6 3044 pmaddubsw m5, m1, m8 ; b0 3045 mova m1, m3 3046 pmaddubsw m3, m9 ; b1 3047 shufpd m18, m16, 0x55 ; 4 5 3048 paddw m4, m2 3049 punpcklbw m2, m18, m16 ; 45 3050 paddw m5, m3 3051 punpckhbw m3, m18, m16 ; 56 3052 mova m18, m16 3053 pmaddubsw m16, m2, m10 ; a2 3054 pmaddubsw m17, m3, m10 ; b2 3055 paddw m4, m16 3056 paddw m5, m17 3057 pmulhrsw m4, m7 3058 pmulhrsw m5, m7 3059 mova [tmpq+ 0], m4 3060 mova [tmpq+64], m5 3061 add tmpq, 64*2 3062 sub hd, 2 3063 jg .v_w32_loop 3064 vzeroupper 3065 RET 3066.v_w64: 3067.v_w128: 3068 mova m6, [bilin_v_perm64] 3069 add wd, wd 3070 lea r6d, [hq+wq] 3071.v_loop0: 3072 vpermq m12, m6, [srcq+ssq*0] 3073 vpermq m13, m6, [srcq+ssq*1] 3074 lea r5, [srcq+ssq*2] 3075 vpermq m14, m6, [r5 +ssq*0] 3076 vpermq m15, m6, [r5 +ssq*1] 3077 lea r5, [r5+ssq*2] 3078 vpermq m16, m6, [r5 +ssq*0] 3079 mov r7, tmpq 3080 punpcklbw m0, m12, m13 ; 01 3081 punpckhbw m12, m13 3082 punpcklbw m1, m13, m14 ; 12 3083 punpckhbw m13, m14 3084 punpcklbw m2, m14, m15 ; 23 3085 punpckhbw m14, m15 3086 punpcklbw m3, m15, m16 ; 34 3087 punpckhbw m15, m16 3088.v_loop: 3089 pmaddubsw m17, m0, m8 ; 
a0 3090 vpermq m5, m6, [r5+ssq*1] 3091 pmaddubsw m18, m12, m8 3092 mova m0, m2 3093 pmaddubsw m2, m9 ; a1 3094 mova m12, m14 3095 pmaddubsw m14, m9 3096 lea r5, [r5+ssq*2] 3097 pmaddubsw m19, m1, m8 ; b0 3098 pmaddubsw m20, m13, m8 3099 mova m1, m3 3100 pmaddubsw m3, m9 ; b1 3101 mova m13, m15 3102 pmaddubsw m15, m9 3103 paddw m17, m2 3104 punpcklbw m2, m16, m5 ; 67 3105 paddw m18, m14 3106 punpckhbw m14, m16, m5 3107 vpermq m16, m6, [r5+ssq*0] 3108 paddw m19, m3 3109 pmaddubsw m3, m2, m10 ; a3 3110 paddw m20, m15 3111 pmaddubsw m15, m14, m10 3112 paddw m17, m3 3113 punpcklbw m3, m5, m16 ; 78 3114 pmaddubsw m4, m3, m10 ; b3 3115 paddw m18, m15 3116 punpckhbw m15, m5, m16 3117 pmaddubsw m5, m15, m10 3118 paddw m19, m4 3119 paddw m20, m5 3120 REPX {pmulhrsw x, m7}, m17, m18, m19, m20 3121 mova [r7+wq*0+ 0], m17 3122 mova [r7+wq*0+64], m18 3123 mova [r7+wq*1+ 0], m19 3124 mova [r7+wq*1+64], m20 3125 lea r7, [r7+wq*2] 3126 sub hd, 2 3127 jg .v_loop 3128 add srcq, 64 3129 add tmpq, 128 3130 movzx hd, r6b 3131 sub r6d, 1<<8 3132 jg .v_loop0 3133 vzeroupper 3134 RET 3135.h: 3136 test myd, 0xf00 3137 jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 3138.hv: 3139 vpbroadcastd m8, [pd_2] 3140 vpbroadcastd m9, [pd_32] 3141 cmp wd, 4 3142 jg .hv_w8 3143 movzx mxd, mxb 3144 vpbroadcastd m11, [base+subpel_filters+mxq*8+2] 3145 movzx mxd, myb 3146 shr myd, 16 3147 cmp hd, 4 3148 cmove myd, mxd 3149 vpbroadcastq m3, [base+subpel_filters+1+myq*8] 3150 vbroadcasti128 m10, [subpel_h_shufA] 3151 lea r6, [ssq*2+1] 3152 mov r3d, 0x30 3153 sub srcq, r6 3154 kmovb k1, r3d 3155 vpbroadcastq ym2, [srcq+ssq*0] 3156 lea ss3q, [ssq*3] 3157 vpbroadcastq m1, [srcq+ssq*1] 3158 kaddb k2, k1, k1 3159 vpbroadcastq m2{k1}, [srcq+ssq*2] 3160 add srcq, ss3q 3161 vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3 3162 punpcklbw m3, m3 3163 vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4 3164 psraw m3, 8 ; sign-extend 3165 mova m6, [spel_hv_perm4a] 3166 kshiftrb k1, k1, 2 3167 movu m7, [spel_hv_perm4b] 
3168 pshufb m1, m10 3169 mova m0, m8 3170 vpdpbusd m0, m1, m11 3171 pshufb m2, m10 3172 mova m1, m8 3173 vpdpbusd m1, m2, m11 3174 pshufd m12, m3, q0000 3175 pshufd m13, m3, q1111 3176 pshufd m14, m3, q2222 3177 packssdw m0, m1 ; _ _ _ 0 1 2 3 4 3178 psraw m0, 2 3179 vpermb m1, m7, m0 ; 01 12 23 34 3180.hv_w4_loop: 3181 movq xm3, [srcq+ssq*2] 3182 movq xm4, [srcq+ss3q ] 3183 lea srcq, [srcq+ssq*4] 3184 vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7 3185 vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8 3186 pshufb ym3, ym10 3187 mova ym2, ym8 3188 vpdpbusd ym2, ym3, ym11 3189 pshufb ym4, ym10 3190 mova ym3, ym8 3191 vpdpbusd ym3, ym4, ym11 3192 mova m4, m9 3193 vpdpwssd m4, m1, m12 ; a0 b0 c0 d0 3194 packssdw ym2, ym3 ; 5 6 7 8 3195 psraw ym2, 2 3196 vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8 3197 vpermb m2, m6, m0 ; 23 34 45 56 3198 vpermb m1, m7, m0 ; 45 56 67 78 3199 vpdpwssd m4, m2, m13 ; a1 b1 c1 d1 3200 vpdpwssd m4, m1, m14 ; a2 b2 c2 d2 3201 psrad m4, 6 3202 vpmovdw [tmpq], m4 3203 add tmpq, 32 3204 sub hd, 4 3205 jg .hv_w4_loop 3206 RET 3207.hv_w8: 3208 shr mxd, 16 3209 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] 3210 vpbroadcastd m11, [base+subpel_filters+mxq*8+4] 3211 movzx mxd, myb 3212 shr myd, 16 3213 cmp hd, 4 3214 cmove myd, mxd 3215 vpbroadcastq m0, [base+subpel_filters+1+myq*8] 3216 lea r6, [ssq*2+3] 3217 punpcklbw m0, m0 3218 sub srcq, r6 3219 psraw m0, 8 ; sign-extend 3220 lea ss3q, [ssq*3] 3221 pshufd m12, m0, q0000 3222 pshufd m13, m0, q1111 3223 pshufd m14, m0, q2222 3224 cmp wd, 8 3225 jg .hv_w16 3226 movu xm16, [srcq+ssq*0] 3227 vbroadcasti32x4 m19, [subpel_h_shufA] 3228 vinserti128 ym16, [srcq+ssq*1], 1 3229 vbroadcasti32x4 m21, [subpel_h_shufC] 3230 vinserti32x4 m16, [srcq+ssq*2], 2 3231 add srcq, ss3q 3232 vinserti32x4 m16, [srcq+ssq*0], 3 3233 movu xm17, [srcq+ssq*1] 3234 vbroadcasti32x4 m20, [subpel_h_shufB] 3235 pshufb m3, m16, m19 ; 0 1 2 3 0123 3236 mova m2, m8 3237 pshufb m0, m16, m21 ; 0 1 2 3 89ab 3238 vpdpbusd m2, m3, m10 3239 mova m3, m8 
3240 pshufb xm1, xm17, xm19 ; 3 4 5 6 0123 3241 vpdpbusd m3, m0, m11 3242 mova xm0, xm8 3243 pshufb xm18, xm17, xm21 ; 3 4 5 6 89ab 3244 vpdpbusd xm0, xm1, xm10 3245 mova xm1, xm8 3246 pshufb m16, m20 ; 0 1 2 3 4567 3247 vpdpbusd xm1, xm18, xm11 3248 pshufb xm17, xm20 ; 3 4 5 6 4567 3249 vpdpbusd m2, m16, m11 3250 vpdpbusd m3, m16, m10 3251 vpdpbusd xm0, xm17, xm11 3252 vpdpbusd xm1, xm17, xm10 3253 packssdw m2, m3 3254 packssdw xm0, xm1 3255 psraw m2, 2 ; 0 1 2 3 3256 psraw xm0, 2 ; 4 3257 valignq m0, m2, 2 ; 1 2 3 4 3258 punpcklwd m1, m2, m0 ; 01 12 23 34 3259 punpckhwd m2, m0 3260.hv_w8_loop: 3261 movu xm16, [srcq+ssq*2] 3262 vinserti128 ym16, [srcq+ss3q ], 1 3263 lea srcq, [srcq+ssq*4] 3264 vinserti32x4 m16, [srcq+ssq*0], 2 3265 vinserti32x4 m16, [srcq+ssq*1], 3 3266 pshufb m6, m16, m19 ; 5 6 7 8 0123 3267 mova m5, m8 3268 pshufb m3, m16, m21 ; 5 6 7 8 89ab 3269 vpdpbusd m5, m6, m10 3270 mova m6, m8 3271 pshufb m16, m20 ; 5 6 7 8 4567 3272 vpdpbusd m6, m3, m11 3273 mova m3, m9 3274 vpdpwssd m3, m1, m12 ; a0 b0 c0 d0 3275 mova m4, m9 3276 vpdpwssd m4, m2, m12 3277 vpdpbusd m5, m16, m11 3278 vpdpbusd m6, m16, m10 3279 mova m16, m1 3280 packssdw m5, m6 3281 mova m6, m2 3282 psraw m5, 2 ; 5 6 7 8 3283 valignq m2, m5, m0, 6 ; 4 5 6 7 3284 mova m0, m5 3285 punpcklwd m1, m2, m5 ; 45 56 67 78 3286 punpckhwd m2, m5 3287 vpdpwssd m3, m1, m14 ; a2 b2 c2 d2 3288 vpdpwssd m4, m2, m14 3289 vshufi32x4 m16, m1, q1032 ; 23 34 45 56 3290 vshufi32x4 m6, m2, q1032 3291 vpdpwssd m3, m16, m13 ; a1 b1 c1 d1 3292 vpdpwssd m4, m6, m13 3293 psrad m3, 6 3294 psrad m4, 6 3295 packssdw m3, m4 3296 mova [tmpq], m3 3297 add tmpq, 64 3298 sub hd, 4 3299 jg .hv_w8_loop 3300 vzeroupper 3301 RET 3302.hv_w16: 3303 mova m16, [spel_h_perm16] 3304 vpbroadcastd m18, [pb_4] 3305 add wd, wd 3306 paddb m17, m18, m16 3307 lea r6d, [hq+wq*8-256] 3308 paddb m18, m17 3309.hv_w16_loop0: 3310 movu ym19, [srcq+ssq*0] 3311 vinserti32x8 m19, [srcq+ssq*1], 1 3312 lea r5, [srcq+ssq*2] 3313 movu ym20, [r5 +ssq*0] 
3314 vinserti32x8 m20, [r5 +ssq*1], 1 3315 lea r5, [r5 +ssq*2] 3316 movu ym21, [r5 +ssq*0] 3317 mov r7, tmpq 3318 vpermb m3, m16, m19 ; 0 1 0123 89ab 3319 mova m2, m8 3320 vpermb m4, m18, m19 ; 0 1 89ab ghij 3321 vpdpbusd m2, m3, m10 3322 mova m3, m8 3323 vpermb m5, m16, m20 ; 2 3 0123 89ab 3324 vpdpbusd m3, m4, m11 3325 mova m4, m8 3326 vpermb m0, m18, m20 ; 2 3 89ab ghij 3327 vpdpbusd m4, m5, m10 3328 mova m5, m8 3329 vpermb ym1, ym16, ym21 ; 4 0123 89ab 3330 vpdpbusd m5, m0, m11 3331 mova ym0, ym8 3332 vpermb ym6, ym18, ym21 ; 4 89ab ghij 3333 vpdpbusd ym0, ym1, ym10 3334 mova ym1, ym8 3335 vpermb m19, m17, m19 ; 0 1 4567 cdef 3336 vpdpbusd ym1, ym6, ym11 3337 vpermb m20, m17, m20 ; 2 3 4567 cdef 3338 vpdpbusd m2, m19, m11 3339 vpdpbusd m3, m19, m10 3340 vpermb ym21, ym17, ym21 ; 4 4567 cdef 3341 vpdpbusd m4, m20, m11 3342 vpdpbusd m5, m20, m10 3343 vpdpbusd ym0, ym21, ym11 3344 vpdpbusd ym1, ym21, ym10 3345 packssdw m2, m3 ; 0 1 3346 packssdw m4, m5 ; 2 3 3347 packssdw ym0, ym1 ; 4 3348 REPX {psraw x, 2}, m2, m4, ym0 3349 vshufi32x4 m3, m2, m4, q1032 ; 1 2 3350 vshufi32x4 m0, m4, m0, q1032 ; 3 4 3351 punpcklwd m1, m2, m3 ; 01 12 3352 punpckhwd m2, m3 3353 punpcklwd m3, m4, m0 ; 23 34 3354 punpckhwd m4, m0 3355.hv_w16_loop: 3356 movu ym19, [r5+ssq*1] 3357 lea r5, [r5+ssq*2] 3358 vinserti32x8 m19, [r5+ssq*0], 1 3359 vpermb m6, m16, m19 ; 5 6 0123 89ab 3360 mova m5, m8 3361 vpermb m20, m18, m19 ; 5 6 89ab ghij 3362 vpdpbusd m5, m6, m10 3363 mova m6, m8 3364 vpermb m19, m17, m19 ; 5 6 4567 cdef 3365 vpdpbusd m6, m20, m11 3366 mova m20, m9 3367 vpdpwssd m20, m1, m12 ; a0 b0 3368 mova m21, m9 3369 vpdpwssd m21, m2, m12 3370 vpdpbusd m5, m19, m11 3371 vpdpbusd m6, m19, m10 3372 vpdpwssd m20, m3, m13 ; a1 b1 3373 vpdpwssd m21, m4, m13 3374 packssdw m5, m6 3375 mova m1, m3 3376 psraw m5, 2 ; 5 6 3377 mova m2, m4 3378 vshufi32x4 m4, m0, m5, q1032 ; 4 5 3379 mova m0, m5 3380 punpcklwd m3, m4, m0 ; 45 56 3381 punpckhwd m4, m0 3382 vpdpwssd m20, m3, m14 ; a2 b2 3383 
vpdpwssd m21, m4, m14 3384 psrad m20, 6 3385 psrad m21, 6 3386 packssdw m20, m21 3387 mova [r7+wq*0], ym20 3388 vextracti32x8 [r7+wq*1], m20, 1 3389 lea r7, [r7+wq*2] 3390 sub hd, 2 3391 jg .hv_w16_loop 3392 add srcq, 16 3393 add tmpq, 32 3394 movzx hd, r6b 3395 sub r6d, 1<<8 3396 jg .hv_w16_loop0 3397 vzeroupper 3398 RET 3399 3400%macro PREP_8TAP_H 0 3401 vpermb m10, m5, m0 3402 vpermb m11, m5, m1 3403 vpermb m12, m6, m0 3404 vpermb m13, m6, m1 3405 vpermb m14, m7, m0 3406 vpermb m15, m7, m1 3407 mova m0, m4 3408 vpdpbusd m0, m10, m8 3409 mova m2, m4 3410 vpdpbusd m2, m12, m8 3411 mova m1, m4 3412 vpdpbusd m1, m11, m8 3413 mova m3, m4 3414 vpdpbusd m3, m13, m8 3415 vpdpbusd m0, m12, m9 3416 vpdpbusd m2, m14, m9 3417 vpdpbusd m1, m13, m9 3418 vpdpbusd m3, m15, m9 3419 packssdw m0, m2 3420 packssdw m1, m3 3421 psraw m0, 2 3422 psraw m1, 2 3423 mova [tmpq+64*0], m0 3424 mova [tmpq+64*1], m1 3425%endmacro 3426 3427PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc 3428PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc 3429PREP_8TAP_FN sharp, SHARP, SHARP 3430 3431cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3 3432 imul mxd, mxm, 0x010101 3433 add mxd, t0d ; 8tap_h, mx, 4tap_h 3434 imul myd, mym, 0x010101 3435 add myd, t1d ; 8tap_v, my, 4tap_v 3436 lea r7, [prep_avx512icl] 3437 movifnidn hd, hm 3438 test mxd, 0xf00 3439 jnz .h 3440 test myd, 0xf00 3441 jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep 3442.v: 3443 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 3444 shr myd, 16 ; Note that the code is 8-tap only, having 3445 cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 3446 cmove myd, mxd ; had a negligible effect on performance. 
3447 tzcnt r5d, wd 3448 lea myq, [base+subpel_filters+myq*8] 3449 movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)] 3450 vpbroadcastd m7, [pw_8192] 3451 vpbroadcastw m8, [myq+0] 3452 add r5, r7 3453 vpbroadcastw m9, [myq+2] 3454 lea stride3q, [strideq*3] 3455 vpbroadcastw m10, [myq+4] 3456 sub srcq, stride3q 3457 vpbroadcastw m11, [myq+6] 3458 jmp r5 3459.v_w4: 3460 movd xmm0, [srcq+strideq*0] 3461 vpbroadcastd ymm1, [srcq+strideq*2] 3462 vpbroadcastd xmm2, [srcq+strideq*1] 3463 vpbroadcastd ymm3, [srcq+stride3q ] 3464 lea srcq, [srcq+strideq*4] 3465 vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ 3466 vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ 3467 vpbroadcastd ymm0, [srcq+strideq*0] 3468 vpbroadcastd ymm2, [srcq+strideq*1] 3469 vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ 3470 vpbroadcastd ymm0, [srcq+strideq*2] 3471 vbroadcasti128 ymm5, [deint_shuf4] 3472 vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 3473 vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 3474 vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ 3475 punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 3476 vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 3477 punpckhbw ymm2, ymm3 ; 23 34 45 56 3478.v_w4_loop: 3479 pinsrd xmm0, [srcq+stride3q ], 1 3480 lea srcq, [srcq+strideq*4] 3481 vpbroadcastd ymm3, [srcq+strideq*0] 3482 vpbroadcastd ymm4, [srcq+strideq*1] 3483 vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ 3484 vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ 3485 vpbroadcastd ymm0, [srcq+strideq*2] 3486 vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ 3487 pshufb ymm3, ymm5 ; 67 78 89 9a 3488 pmaddubsw ymm4, ymm1, ym8 3489 vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 3490 pmaddubsw ymm2, ym9 3491 paddw ymm4, ymm2 3492 mova ymm2, ymm3 3493 pmaddubsw ymm3, ym11 3494 paddw ymm3, ymm4 3495 pmaddubsw ymm4, ymm1, ym10 3496 paddw ymm3, ymm4 3497 pmulhrsw ymm3, ym7 3498 mova [tmpq], ymm3 3499 add tmpq, 32 3500 sub hd, 4 3501 jg .v_w4_loop 3502 vzeroupper 3503 RET 3504.v_w8: 3505 mova m6, [spel_v_perm8] 3506 movq xm1, [srcq+strideq*0] 
3507 mov r6d, 0x3e 3508 movq xm2, [srcq+strideq*1] 3509 vpbroadcastq ym3, [srcq+strideq*2] 3510 kmovb k1, r6d 3511 vpbroadcastq ym4, [srcq+stride3q ] 3512 lea srcq, [srcq+strideq*4] 3513 vpunpcklqdq m1{k1}, m3, [srcq+strideq*0] {1to8} 3514 vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8} 3515 movq xm0, [srcq+strideq*2] 3516 kshiftlb k2, k1, 2 3517 shufpd m1, m2, 0x30 ; 0 1 2 3 4 5 3518 vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _ 3519 vpermb m1, m6, m1 ; 01 12 23 34 3520 vpermb m2, m6, m2 ; 23 34 45 56 3521.v_w8_loop: 3522 vpbroadcastq ym3, [srcq+strideq*4] 3523 vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4} 3524 lea srcq, [srcq+strideq*4] 3525 vpbroadcastq m3, [srcq+strideq*2] 3526 vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8} 3527 pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 3528 mova m1, m2 3529 pmaddubsw m5, m2, m9 ; a1 b1 c1 d1 3530 vpermb m2, m6, m0 ; 67 78 89 9a 3531 mova xm0, xm3 3532 vshufi32x4 m1, m2, q1032 ; 45 56 67 78 3533 pmaddubsw m3, m2, m11 ; a3 b3 c3 d3 3534 paddw m4, m5 3535 pmaddubsw m5, m1, m10 ; a2 b2 c2 d2 3536 paddw m4, m3 3537 paddw m4, m5 3538 pmulhrsw m4, m7 3539 mova [tmpq], m4 3540 add tmpq, 64 3541 sub hd, 4 3542 jg .v_w8_loop 3543 RET 3544.v_w16: 3545 mova m12, [spel_v_perm16b] 3546 vbroadcasti32x4 m1, [srcq+strideq*0] 3547 mov r6d, 0x0f 3548 vbroadcasti32x4 ym4, [srcq+strideq*1] 3549 vbroadcasti32x4 m2, [srcq+strideq*2] 3550 kmovb k1, r6d 3551 vbroadcasti32x4 ym5, [srcq+stride3q ] 3552 lea srcq, [srcq+strideq*4] 3553 vbroadcasti32x4 m3, [srcq+strideq*0] 3554 vbroadcasti32x4 ym6, [srcq+strideq*1] 3555 vbroadcasti32x4 m0, [srcq+strideq*2] 3556 vshufpd m1{k1}, m4, m2, 0xcc 3557 vshufpd m2{k1}, m5, m3, 0xcc 3558 vshufpd m3{k1}, m6, m0, 0xcc 3559 vpermb m1, m12, m1 ; 01 12 3560 vpermb m2, m12, m2 ; 23 34 3561 vpermb m3, m12, m3 ; 45 56 3562.v_w16_loop: 3563 pmaddubsw m4, m1, m8 ; a0 b0 3564 mova m1, m3 3565 pmaddubsw m13, m2, m9 ; a1 b1 3566 vbroadcasti32x4 ym6, [srcq+stride3q ] 3567 pmaddubsw m5, m2, m8 ; c0 d0 3568 lea srcq, [srcq+strideq*4] 
3569 pmaddubsw m14, m3, m9 ; c1 d1 3570 vbroadcasti32x4 m3, [srcq+strideq*0] 3571 vshufpd m0{k1}, m6, m3, 0xcc 3572 vbroadcasti32x4 ym6, [srcq+strideq*1] 3573 vpermb m2, m12, m0 ; 67 78 3574 vbroadcasti32x4 m0, [srcq+strideq*2] 3575 vshufpd m3{k1}, m6, m0, 0xcc 3576 paddw m4, m13 3577 pmaddubsw m13, m1, m10 ; a2 b2 3578 vpermb m3, m12, m3 ; 89 9a 3579 paddw m5, m14 3580 pmaddubsw m14, m2, m10 ; c2 d2 3581 pmaddubsw m15, m2, m11 ; a3 b3 3582 pmaddubsw m6, m3, m11 ; c3 d3 3583 paddw m4, m13 3584 paddw m5, m14 3585 paddw m4, m15 3586 paddw m5, m6 3587 pmulhrsw m4, m7 3588 pmulhrsw m5, m7 3589 mova [tmpq+ 0], m4 3590 mova [tmpq+64], m5 3591 add tmpq, 64*2 3592 sub hd, 4 3593 jg .v_w16_loop 3594 RET 3595.v_w32: 3596 movshdup m21, [bilin_v_perm64] 3597 movu ym16, [srcq+strideq*0] 3598 movu ym17, [srcq+strideq*1] 3599 movu ym18, [srcq+strideq*2] 3600 add srcq, stride3q 3601 movu ym19, [srcq+strideq*0] 3602 vpermt2q m16, m21, m19 ; 0 3 3603 movu ym20, [srcq+strideq*1] 3604 vpermt2q m17, m21, m20 ; 1 4 3605 movu ym20, [srcq+strideq*2] 3606 add srcq, stride3q 3607 vpermt2q m18, m21, m20 ; 2 5 3608 movu ym20, [srcq+strideq*0] 3609 vpermt2q m19, m21, m20 ; 3 6 3610 punpcklbw m0, m16, m17 ; 01 3611 punpcklbw m1, m17, m18 ; 12 3612 punpcklbw m2, m18, m19 ; 23 3613 punpckhbw m3, m16, m17 ; 34 3614 punpckhbw m4, m17, m18 ; 45 3615 punpckhbw m5, m18, m19 ; 56 3616.v_w32_loop: 3617 movu ym16, [srcq+strideq*1] 3618 lea srcq, [srcq+strideq*2] 3619 movu ym17, [srcq+strideq*0] 3620 pmaddubsw m14, m0, m8 3621 mova m0, m2 3622 pmaddubsw m15, m1, m8 3623 mova m1, m3 3624 pmaddubsw m2, m9 3625 vpermt2q m16, m21, m17 ; 7 8 3626 pmaddubsw m3, m9 3627 pmaddubsw m12, m4, m10 3628 pmaddubsw m13, m5, m10 3629 shufpd m19, m16, 0x55 ; 6 7 3630 paddw m14, m2 3631 mova m2, m4 3632 punpcklbw m4, m19, m16 ; 67 3633 paddw m15, m3 3634 mova m3, m5 3635 punpckhbw m5, m19, m16 ; 78 3636 paddw m14, m12 3637 paddw m15, m13 3638 pmaddubsw m12, m4, m11 3639 pmaddubsw m13, m5, m11 3640 mova m19, m16 3641 paddw 
m14, m12 3642 paddw m15, m13 3643 pmulhrsw m14, m7 3644 pmulhrsw m15, m7 3645 mova [tmpq+ 0], m14 3646 mova [tmpq+64], m15 3647 add tmpq, 64*2 3648 sub hd, 2 3649 jg .v_w32_loop 3650 vzeroupper 3651 RET 3652.v_w64: 3653.v_w128: 3654 WIN64_SPILL_XMM 24 3655 mova m23, [bilin_v_perm64] 3656 add wd, wd 3657 lea r6d, [hq+wq] 3658.v_loop0: 3659 vpermq m12, m23, [srcq+strideq*0] 3660 vpermq m13, m23, [srcq+strideq*1] 3661 lea r5, [srcq+strideq*2] 3662 vpermq m14, m23, [r5 +strideq*0] 3663 vpermq m15, m23, [r5 +strideq*1] 3664 lea r5, [r5+strideq*2] 3665 vpermq m16, m23, [r5 +strideq*0] 3666 vpermq m17, m23, [r5 +strideq*1] 3667 lea r5, [r5+strideq*2] 3668 vpermq m18, m23, [r5 +strideq*0] 3669 mov r7, tmpq 3670 punpcklbw m0, m12, m13 ; 01 3671 punpckhbw m12, m13 3672 punpcklbw m1, m13, m14 ; 12 3673 punpckhbw m13, m14 3674 punpcklbw m2, m14, m15 ; 23 3675 punpckhbw m14, m15 3676 punpcklbw m3, m15, m16 ; 34 3677 punpckhbw m15, m16 3678 punpcklbw m4, m16, m17 ; 45 3679 punpckhbw m16, m17 3680 punpcklbw m5, m17, m18 ; 56 3681 punpckhbw m17, m18 3682.v_loop: 3683 pmaddubsw m19, m0, m8 ; a0 3684 vpermq m6, m23, [r5+strideq*1] 3685 pmaddubsw m20, m12, m8 3686 mova m0, m2 3687 pmaddubsw m2, m9 ; a1 3688 mova m12, m14 3689 pmaddubsw m14, m9 3690 lea r5, [r5+strideq*2] 3691 pmaddubsw m21, m1, m8 ; b0 3692 pmaddubsw m22, m13, m8 3693 mova m1, m3 3694 pmaddubsw m3, m9 ; b1 3695 mova m13, m15 3696 pmaddubsw m15, m9 3697 paddw m19, m2 3698 mova m2, m4 3699 pmaddubsw m4, m10 ; a2 3700 paddw m20, m14 3701 mova m14, m16 3702 pmaddubsw m16, m10 3703 paddw m21, m3 3704 mova m3, m5 3705 pmaddubsw m5, m10 ; b2 3706 paddw m22, m15 3707 mova m15, m17 3708 pmaddubsw m17, m10 3709 paddw m19, m4 3710 punpcklbw m4, m18, m6 ; 67 3711 paddw m20, m16 3712 punpckhbw m16, m18, m6 3713 vpermq m18, m23, [r5+strideq*0] 3714 paddw m21, m5 3715 pmaddubsw m5, m4, m11 ; a3 3716 paddw m22, m17 3717 pmaddubsw m17, m16, m11 3718 paddw m19, m5 3719 punpcklbw m5, m6, m18 ; 78 3720 paddw m20, m17 3721 punpckhbw m17, 
m6, m18 3722 pmaddubsw m6, m5, m11 ; b3 3723 paddw m21, m6 3724 pmaddubsw m6, m17, m11 3725 paddw m22, m6 3726 REPX {pmulhrsw x, m7}, m19, m20, m21, m22 3727 mova [r7+wq*0+ 0], m19 3728 mova [r7+wq*0+64], m20 3729 mova [r7+wq*1+ 0], m21 3730 mova [r7+wq*1+64], m22 3731 lea r7, [r7+wq*2] 3732 sub hd, 2 3733 jg .v_loop 3734 add srcq, 64 3735 add tmpq, 128 3736 movzx hd, r6b 3737 sub r6d, 1<<8 3738 jg .v_loop0 3739 RET 3740.h: 3741 RESET_STACK_STATE 3742 test myd, 0xf00 3743 jnz .hv 3744.h2: 3745 vpbroadcastd m4, [pd_2] 3746 cmp wd, 4 3747 je .h_w4 3748 tzcnt wd, wd 3749 shr mxd, 16 3750 sub srcq, 3 3751 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] 3752 vpbroadcastd m8, [base+subpel_filters+mxq*8+0] 3753 vpbroadcastd m9, [base+subpel_filters+mxq*8+4] 3754 add wq, r7 3755 jmp wq 3756.h_w4: 3757 movzx mxd, mxb 3758 vbroadcasti128 ym5, [subpel_h_shufA] 3759 mov r3d, 0x4 3760 dec srcq 3761 vpbroadcastd ym6, [base+subpel_filters+mxq*8+2] 3762 kmovb k1, r3d 3763 lea stride3q, [strideq*3] 3764.h_w4_loop: 3765 movq xm2, [srcq+strideq*0] 3766 movq xm3, [srcq+strideq*1] 3767 vpbroadcastq ym2{k1}, [srcq+strideq*2] 3768 vpbroadcastq ym3{k1}, [srcq+stride3q ] 3769 lea srcq, [srcq+strideq*4] 3770 pshufb ym2, ym5 3771 pshufb ym3, ym5 3772 mova ym0, ym4 3773 vpdpbusd ym0, ym2, ym6 3774 mova ym1, ym4 3775 vpdpbusd ym1, ym3, ym6 3776 packssdw ym0, ym1 3777 psraw ym0, 2 3778 mova [tmpq], ym0 3779 add tmpq, 32 3780 sub hd, 4 3781 jg .h_w4_loop 3782 RET 3783.h_w8: 3784 vbroadcasti128 m5, [subpel_h_shufA] 3785 vbroadcasti128 m6, [subpel_h_shufB] 3786 vbroadcasti128 m7, [subpel_h_shufC] 3787 lea stride3q, [strideq*3] 3788.h_w8_loop: 3789 movu xmm3, [srcq+strideq*0] 3790 vinserti128 ym3, ymm3, [srcq+strideq*1], 1 3791 vinserti128 m3, [srcq+strideq*2], 2 3792 vinserti128 m3, [srcq+stride3q ], 3 3793 lea srcq, [srcq+strideq*4] 3794 pshufb m1, m3, m5 3795 pshufb m2, m3, m6 3796 mova m0, m4 3797 vpdpbusd m0, m1, m8 3798 mova m1, m4 3799 vpdpbusd m1, m2, m8 3800 pshufb m3, m7 3801 
vpdpbusd m0, m2, m9 3802 vpdpbusd m1, m3, m9 3803 packssdw m0, m1 3804 psraw m0, 2 3805 mova [tmpq], m0 3806 add tmpq, 64 3807 sub hd, 4 3808 jg .h_w8_loop 3809 RET 3810.h_w16: 3811 mova m5, [spel_h_perm16] 3812 vpbroadcastd m7, [pb_4] 3813 lea stride3q, [strideq*3] 3814 paddb m6, m7, m5 3815 paddb m7, m6 3816.h_w16_loop: 3817 movu ym0, [srcq+strideq*0] 3818 movu ym1, [srcq+strideq*2] 3819 vinserti32x8 m0, [srcq+strideq*1], 1 3820 vinserti32x8 m1, [srcq+stride3q ], 1 3821 lea srcq, [srcq+strideq*4] 3822 PREP_8TAP_H 3823 add tmpq, 64*2 3824 sub hd, 4 3825 jg .h_w16_loop 3826 RET 3827.h_w32: 3828 mova m5, [spel_h_perm32] 3829 vpbroadcastd m7, [pb_4] 3830 paddb m6, m7, m5 3831 paddb m7, m6 3832.h_w32_loop: 3833 movu m0, [srcq+strideq*0] 3834 movu m1, [srcq+strideq*1] 3835 lea srcq, [srcq+strideq*2] 3836 PREP_8TAP_H 3837 add tmpq, 64*2 3838 sub hd, 2 3839 jg .h_w32_loop 3840 RET 3841.h_w64: 3842 xor r6d, r6d 3843 jmp .h_start 3844.h_w128: 3845 mov r6, -64*1 3846.h_start: 3847 mova m5, [spel_h_perm32] 3848 vpbroadcastd m7, [pb_4] 3849 sub srcq, r6 3850 paddb m6, m7, m5 3851 paddb m7, m6 3852.h_loop0: 3853 mov r5, r6 3854.h_loop: 3855 movu m0, [srcq+r5+32*0] 3856 movu m1, [srcq+r5+32*1] 3857 PREP_8TAP_H 3858 add tmpq, 64*2 3859 add r5, 64 3860 jle .h_loop 3861 add srcq, strideq 3862 dec hd 3863 jg .h_loop0 3864 RET 3865.hv: 3866 RESET_STACK_STATE 3867 vpbroadcastd m8, [pd_2] 3868 vpbroadcastd m9, [pd_32] 3869 cmp wd, 4 3870 jg .hv_w8 3871 movzx mxd, mxb 3872 dec srcq 3873 vpbroadcastd m11, [base+subpel_filters+mxq*8+2] 3874 movzx mxd, myb 3875 shr myd, 16 3876 cmp hd, 4 3877 cmove myd, mxd 3878 vpbroadcastq m0, [base+subpel_filters+myq*8] 3879 lea stride3q, [strideq*3] 3880 sub srcq, stride3q 3881 mov r3d, 0x04 3882 kmovb k1, r3d 3883 kshiftlb k2, k1, 2 3884 kshiftlb k3, k1, 4 3885 vbroadcasti128 m10, [subpel_h_shufA] 3886 punpcklbw m0, m0 3887 psraw m0, 8 ; sign-extend 3888 pshufd m12, m0, q0000 3889 pshufd m13, m0, q1111 3890 pshufd m14, m0, q2222 3891 pshufd m15, m0, 
q3333 3892 movq xm3, [srcq+strideq*0] 3893 vpbroadcastq ym2, [srcq+strideq*1] 3894 vpbroadcastq ym3{k1}, [srcq+strideq*2] 3895 vpbroadcastq m2{k2}, [srcq+stride3q ] 3896 lea srcq, [srcq+strideq*4] 3897 vpbroadcastq m3{k2}, [srcq+strideq*0] 3898 vpbroadcastq m2{k3}, [srcq+strideq*1] 3899 vpbroadcastq m3{k3}, [srcq+strideq*2] 3900 mova m6, [spel_hv_perm4a] 3901 movu m7, [spel_hv_perm4b] 3902 mova m0, m8 3903 mova m1, m8 3904 pshufb m2, m10 3905 pshufb m3, m10 3906 vpdpbusd m0, m2, m11 3907 vpdpbusd m1, m3, m11 3908 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 3909 psraw m0, 2 3910 vpermb m1, m6, m0 ; 01 12 23 34 3911 vpermb m2, m7, m0 ; 23 34 45 56 3912.hv_w4_loop: 3913 movq xm3, [srcq+stride3q ] 3914 lea srcq, [srcq+strideq*4] 3915 movq xm4, [srcq+strideq*0] 3916 vpbroadcastq ym3{k1}, [srcq+strideq*1] 3917 vpbroadcastq ym4{k1}, [srcq+strideq*2] 3918 mova m5, m9 3919 pshufb ym3, ym10 3920 vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 3921 mova ym1, ym8 3922 pshufb ym4, ym10 3923 vpdpbusd ym1, ym3, ym11 3924 mova ym3, ym8 3925 vpdpbusd ym3, ym4, ym11 3926 vpdpwssd m5, m2, m13 ; a1 b1 c1 d1 3927 packssdw ym1, ym3 ; 7 8 9 a 3928 psraw ym1, 2 3929 vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a 3930 vpermb m1, m6, m0 ; 45 56 67 78 3931 vpermb m2, m7, m0 ; 67 78 89 9a 3932 vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 3933 vpdpwssd m5, m2, m15 ; a3 b3 c3 d3 3934 psrad m5, 6 3935 vpmovdw [tmpq], m5 3936 add tmpq, 32 3937 sub hd, 4 3938 jg .hv_w4_loop 3939 RET 3940.hv_w8: 3941 shr mxd, 16 3942 sub srcq, 3 3943 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] 3944 vpbroadcastd m11, [base+subpel_filters+mxq*8+4] 3945 movzx mxd, myb 3946 shr myd, 16 3947 cmp hd, 4 3948 cmove myd, mxd 3949 vpbroadcastq m0, [base+subpel_filters+myq*8] 3950 lea stride3q, [strideq*3] 3951 sub srcq, stride3q 3952 punpcklbw m0, m0 3953 psraw m0, 8 ; sign-extend 3954 pshufd m12, m0, q0000 3955 pshufd m13, m0, q1111 3956 pshufd m14, m0, q2222 3957 pshufd m15, m0, q3333 3958 cmp wd, 8 3959 jg .hv_w16 3960 vbroadcasti32x4 m17, [srcq+stride3q 
] 3961 vinserti32x4 m16, m17, [srcq+strideq*0], 0 3962 vbroadcasti32x4 m19, [subpel_h_shufA] 3963 vinserti32x4 m16, [srcq+strideq*1], 1 3964 vbroadcasti32x4 m21, [subpel_h_shufC] 3965 vinserti32x4 m16, [srcq+strideq*2], 2 3966 lea srcq, [srcq+strideq*4] 3967 vinserti128 ym17, [srcq+strideq*0], 1 3968 vbroadcasti32x4 m20, [subpel_h_shufB] 3969 vinserti32x4 m17, [srcq+strideq*1], 2 3970 vinserti32x4 m17, [srcq+strideq*2], 3 3971 pshufb m3, m16, m19 ; 0 1 2 3 0123 3972 mova m2, m8 3973 pshufb m0, m16, m21 ; 0 1 2 3 89ab 3974 vpdpbusd m2, m3, m10 3975 mova m3, m8 3976 pshufb m1, m17, m19 ; 3 4 5 6 0123 3977 vpdpbusd m3, m0, m11 3978 mova m0, m8 3979 pshufb m4, m17, m21 ; 3 4 5 6 89ab 3980 vpdpbusd m0, m1, m10 3981 mova m1, m8 3982 pshufb m16, m20 ; 0 1 2 3 4567 3983 vpdpbusd m1, m4, m11 3984 pshufb m17, m20 ; 3 4 5 6 4567 3985 vpdpbusd m2, m16, m11 3986 vpdpbusd m3, m16, m10 3987 vpdpbusd m0, m17, m11 3988 vpdpbusd m1, m17, m10 3989 packssdw m2, m3 3990 packssdw m0, m1 3991 psraw m2, 2 ; 0 1 2 3 3992 psraw m0, 2 ; 3 4 5 6 3993 vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5 3994 vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4 3995 punpcklwd m3, m4, m0 ; 23 34 45 56 3996 punpckhwd m4, m0 3997 punpcklwd m1, m2, m5 ; 01 12 23 34 3998 punpckhwd m2, m5 3999.hv_w8_loop: 4000 movu xm18, [srcq+stride3q ] 4001 lea srcq, [srcq+strideq*4] 4002 vinserti128 ym18, [srcq+strideq*0], 1 4003 vinserti32x4 m18, [srcq+strideq*1], 2 4004 vinserti32x4 m18, [srcq+strideq*2], 3 4005 pshufb m17, m18, m19 ; 7 8 9 a 0123 4006 mova m16, m8 4007 pshufb m5, m18, m21 ; 7 8 9 a 89ab 4008 vpdpbusd m16, m17, m10 4009 mova m17, m8 4010 pshufb m18, m20 ; 7 8 9 a 4567 4011 vpdpbusd m17, m5, m11 4012 mova m5, m9 4013 vpdpwssd m5, m3, m13 ; a1 b1 c1 d1 4014 mova m6, m9 4015 vpdpwssd m6, m4, m13 4016 vpdpbusd m16, m18, m11 4017 vpdpbusd m17, m18, m10 4018 vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 4019 mova m1, m3 4020 vpdpwssd m6, m2, m12 4021 mova m2, m4 4022 packssdw m16, m17 4023 psraw m16, 2 ; 7 8 9 a 4024 valignq m4, m16, m0, 6 
; 6 7 8 9 4025 mova m0, m16 4026 punpcklwd m3, m4, m16 ; 67 78 89 9a 4027 punpckhwd m4, m16 4028 vpdpwssd m5, m3, m15 ; a3 b3 c3 d3 4029 vpdpwssd m6, m4, m15 4030 vshufi32x4 m1, m3, q1032 ; 45 56 67 78 4031 vshufi32x4 m2, m4, q1032 4032 vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 4033 vpdpwssd m6, m2, m14 4034 psrad m5, 6 4035 psrad m6, 6 4036 packssdw m5, m6 4037 mova [tmpq], m5 4038 add tmpq, 64 4039 sub hd, 4 4040 jg .hv_w8_loop 4041 vzeroupper 4042 RET 4043.hv_w16: 4044 WIN64_SPILL_XMM 23 4045 mova m16, [spel_h_perm16] 4046 vpbroadcastd m18, [pb_4] 4047 add wd, wd 4048 paddb m17, m18, m16 4049 lea r6d, [hq+wq*8-256] 4050 paddb m18, m17 4051.hv_w16_loop0: 4052 movu ym19, [srcq+strideq*0] 4053 vinserti32x8 m19, [srcq+strideq*1], 1 4054 lea r5, [srcq+strideq*2] 4055 movu ym20, [r5 +strideq*0] 4056 vinserti32x8 m20, [r5 +strideq*1], 1 4057 lea r5, [r5 +strideq*2] 4058 movu ym21, [r5 +strideq*0] 4059 vinserti32x8 m21, [r5 +strideq*1], 1 4060 lea r5, [r5 +strideq*2] 4061 movu ym22, [r5 +strideq*0] 4062 mov r7, tmpq 4063 vpermb m3, m16, m19 ; 0 1 0123 89ab 4064 mova m2, m8 4065 vpermb m4, m18, m19 ; 0 1 89ab ghij 4066 vpdpbusd m2, m3, m10 4067 mova m3, m8 4068 vpermb m5, m16, m20 ; 2 3 0123 89ab 4069 vpdpbusd m3, m4, m11 4070 mova m4, m8 4071 vpermb m6, m18, m20 ; 2 3 89ab ghij 4072 vpdpbusd m4, m5, m10 4073 mova m5, m8 4074 vpermb m7, m16, m21 ; 4 5 0123 89ab 4075 vpdpbusd m5, m6, m11 4076 mova m6, m8 4077 vpermb m0, m18, m21 ; 4 5 89ab ghij 4078 vpdpbusd m6, m7, m10 4079 mova m7, m8 4080 vpermb ym1, ym16, ym22 ; 6 0123 89ab 4081 vpdpbusd m7, m0, m11 4082 mova ym0, ym8 4083 vpermb m19, m17, m19 ; 0 1 4567 cdef 4084 vpdpbusd ym0, ym1, ym10 4085 vpermb ym1, ym18, ym22 ; 6 89ab ghij 4086 vpdpbusd m2, m19, m11 4087 vpdpbusd m3, m19, m10 4088 mova ym19, ym8 4089 vpermb m20, m17, m20 ; 2 3 4567 cdef 4090 vpdpbusd ym19, ym1, ym11 4091 vpermb m21, m17, m21 ; 4 5 4567 cdef 4092 vpdpbusd m4, m20, m11 4093 vpdpbusd m5, m20, m10 4094 vpermb ym22, ym17, ym22 ; 6 4567 cdef 4095 vpdpbusd 
m6, m21, m11 4096 vpdpbusd m7, m21, m10 4097 packssdw m2, m3 ; 0 1 4098 vpdpbusd ym0, ym22, ym11 4099 packssdw m4, m5 ; 2 3 4100 vpdpbusd ym19, ym22, ym10 4101 packssdw m6, m7 ; 4 5 4102 packssdw ym0, ym19 ; 6 4103 REPX {psraw x, 2}, m2, m4, m6, ym0 4104 vshufi32x4 m3, m2, m4, q1032 ; 1 2 4105 vshufi32x4 m5, m4, m6, q1032 ; 3 4 4106 vshufi32x4 m0, m6, m0, q1032 ; 5 6 4107 punpcklwd m1, m2, m3 ; 01 12 4108 punpckhwd m2, m3 4109 punpcklwd m3, m4, m5 ; 23 34 4110 punpckhwd m4, m5 4111 punpcklwd m5, m6, m0 ; 45 56 4112 punpckhwd m6, m0 4113.hv_w16_loop: 4114 movu ym19, [r5+strideq*1] 4115 lea r5, [r5+strideq*2] 4116 vinserti32x8 m19, [r5+strideq*0], 1 4117 mova m20, m9 4118 vpdpwssd m20, m1, m12 ; a0 4119 vpermb m1, m16, m19 4120 mova m21, m9 4121 vpdpwssd m21, m2, m12 ; b0 4122 vpermb m2, m17, m19 4123 mova m22, m8 4124 vpdpbusd m22, m1, m10 4125 mova m1, m8 4126 vpermb m19, m18, m19 4127 vpdpbusd m1, m2, m10 4128 vpdpwssd m20, m3, m13 ; a1 4129 vpdpwssd m21, m4, m13 ; b1 4130 vpdpbusd m22, m2, m11 4131 mova m2, m4 4132 vpdpbusd m1, m19, m11 4133 mova m4, m6 4134 vpdpwssd m20, m5, m14 ; a2 4135 vpdpwssd m21, m6, m14 ; b2 4136 packssdw m22, m1 4137 mova m1, m3 4138 psraw m22, 2 ; 7 8 4139 mova m3, m5 4140 vshufi32x4 m6, m0, m22, q1032 ; 6 7 4141 mova m0, m22 4142 punpcklwd m5, m6, m0 ; 67 78 4143 punpckhwd m6, m0 4144 vpdpwssd m20, m5, m15 ; a3 4145 vpdpwssd m21, m6, m15 ; b3 4146 psrad m20, 6 4147 psrad m21, 6 4148 packssdw m20, m21 4149 mova [r7+wq*0], ym20 4150 vextracti32x8 [r7+wq*1], m20, 1 4151 lea r7, [r7+wq*2] 4152 sub hd, 2 4153 jg .hv_w16_loop 4154 add srcq, 16 4155 add tmpq, 32 4156 movzx hd, r6b 4157 sub r6d, 1<<8 4158 jg .hv_w16_loop0 4159 RET 4160 4161cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts 4162 vpbroadcastd m9, [pd_16384] 4163 mova ym15, [warp_8x8t_end] 4164 call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main 4165 jmp .start 4166.loop: 4167 call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2 4168 lea tmpq, 
[tmpq+tsq*4]
.start:
    ; Store one 8x2 strip of 16-bit intermediates for the "t" variant.
    paddd                m16, m16            ; double before the byte-permute pack
    vpermb               m16, m15, m16       ; warp_8x8t_end: pack dwords to words
    mova       [tmpq+tsq*0], xm16
    vextracti128 [tmpq+tsq*2], ym16, 1
    sub                  r6d, 0x1800         ; r6d (0x5555 from .main) doubles as row counter
    jg .loop
    RET

;-----------------------------------------------------------------------
; warp_affine_8x8(dst, dst_stride, src, src_stride, abcd, mx, my)
; 8x8 affine warp: per-row horizontal filter in .h, vertical filter in
; .main/.main2, then >>19 + unsigned pack to 8-bit pixels here.
;-----------------------------------------------------------------------
cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
    vpbroadcastd         m9, [pd_262144]     ; vertical-pass rounding bias
    mova               xm15, [warp_8x8_end]  ; final byte-interleave pattern
    call .main
    jmp .start
.loop:
    call .main2                              ; produces two more output rows in m16
    lea                dstq, [dstq+dsq*2]
.start:
    psrad               m16, 19
    packuswb            m16, m16
    vpermb              m16, m15, m16
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    sub                 r6d, 0x1800          ; same counter trick as the 8x8t variant
    jg .loop
    RET
ALIGN function_align
.main:
    ; Setup: derive per-column filter indices from mx/my and the affine
    ; coefficients at [abcdq], then run the first 7 horizontal rows.
    vpbroadcastd         m1, [pd_512]
%if WIN64
    mov               abcdq, r5mp
    vpaddd             ym18, ym1, r6m {1to8} ; mx
%else
    add                 r5d, 512             ; mx + 512 rounding term
    vpbroadcastd       ym18, r5d
%endif
    vpaddd             ym20, ym1, r7m {1to8} ; my
    mova               ym16, [pd_0to7]       ; column indices 0..7
    vpbroadcastd       ym19, [abcdq+4*0]
    vpbroadcastd       ym21, [abcdq+4*1]
    lea                  r4, [ssq*3+3]
    mova                m10, [warp_8x8_permA]
    mov                 r6d, 0x5555          ; odd-lane mask, also reused as loop counter
    mova                m11, [warp_8x8_permB]
    lea             filterq, [mc_warp_filter+64*8]
    vpbroadcastq        m12, [warp_8x8_hpack]
    sub                srcq, r4              ; src -= src_stride*3 + 3
    vbroadcasti32x4     m13, [warp_8x8_permC]
    kxnorb               k2, k2, k2          ; all-ones gather mask
    vbroadcasti32x4     m14, [warp_8x8_permD]
    vpdpwssd           ym18, ym19, ym16      ; alpha
    vpdpwssd           ym20, ym21, ym16      ; gamma
    vbroadcasti32x4      m0, [srcq]
    psrad              ym19, 16              ; beta
    psrad              ym21, 16              ; delta
    kmovw                k1, r6d
    psrad              ym16, ym18, 10        ; filter index = x >> 10
    kmovb                k3, k2
    paddd              ym18, ym19            ; advance x by beta for next row
    vpgatherdq        m2{k2}, [filterq+ym16*8] ; filter_x0
    psrld                m1, 8               ; pd_2
    pshufb               m0, m11
    paddd                m8, m1, m1          ; pd_4
    vpdpbusd             m1, m0, m2
    call .h
    psllq                m2, m1, 45
    pslld                m1, 13
    paddd                m1, m2
    vpshrdq              m1, m0, 48          ; 01 12
    call .h
    vpshrdq              m2, m1, m0, 48      ; 23 34
    call .h
    vpshrdq              m3, m2, m0, 48      ; 45 56
.main2:
    ; Vertical pass: gather two rows' worth of y-filters and accumulate
    ; the four row-pair products into m16.
    call .h
    psrad              ym17, ym20, 10        ; filter index = y >> 10
    kmovb                k2, k3
    paddd              ym20, ym21            ; advance y by delta
    vpgatherdq
m7{k3}, [filterq+ym17*8] ; filter_y0
    psrad              ym16, ym20, 10
    kmovb                k3, k2
    paddd              ym20, ym21
    vpgatherdq      m17{k2}, [filterq+ym16*8] ; filter_y1
    ; Interleave the two gathered y-filters so each vpdpwssd consumes one
    ; tap pair per 32-bit lane; m1..m3 hold the row-pair history (01 12 ..).
    shufps               m5, m7, m17, q2020  ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
    mova                m16, m9              ; start from the rounding bias
    pshufb               m4, m5, m13         ; a0 a1 A0 A1 b0 b1 B0 B1
    vpdpwssd            m16, m1, m4
    pshufb               m5, m14             ; a2 a3 A2 A3 b2 b3 B2 B3
    mova                 m1, m2              ; shift row-pair history down
    vpdpwssd            m16, m2, m5
    shufps               m5, m7, m17, q3131  ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
    mova                 m2, m3
    pshufb               m4, m5, m13         ; a4 a5 A4 A5 b4 b5 B4 B5
    vpdpwssd            m16, m3, m4
    vpshrdq              m3, m0, 48          ; 67 78
    pshufb               m5, m14             ; a6 a7 A6 A7 b6 b7 B6 B7
    vpdpwssd            m16, m3, m5
    ret
ALIGN function_align
.h:
    ; Horizontal pass for two source rows: gather per-column x-filters,
    ; dual-accumulate with vpdpbusd, result left in m0 (also advances srcq
    ; by two rows and x by two betas).
    movu                xm5, [srcq+ssq*1]
    psrad              ym16, ym18, 10
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    kmovb                k2, k3
    paddd              ym18, ym19
    vpgatherdq       m6{k3}, [filterq+ym16*8] ; filter_x1
    psrad              ym17, ym18, 10
    kmovb                k3, k2
    paddd              ym18, ym19
    vpgatherdq      m16{k2}, [filterq+ym17*8] ; filter_x2
    mova                 m0, m8              ; pd_4 rounding bias
    vpermb               m4, m10, m5         ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
    vpshldq             m17, m16, m6, 32     ; a4 a5 a6 a7 b0 b1 b2 b3
    vpdpbusd             m0, m4, m17
    vpermb               m5, m11, m5         ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
    vmovdqa32       m16{k1}, m6              ; a0 a1 a2 a3 b4 b5 b6 b7
    vpdpbusd             m0, m5, m16
    vpmultishiftqb       m0, m12, m0         ; 1 1 2 2 (>> 3)
    ret

;-----------------------------------------------------------------------
; BIDIR_FN op: shared width-dispatched store tail for the bidirectional
; compositing functions (avg/w_avg/mask). `op 0` computes one register of
; packed 8-bit output into m0; `op_INC_PTR n` advances the tmp pointers.
;-----------------------------------------------------------------------
%macro BIDIR_FN 1 ; op
    lea            stride3q, [strideq*3]
    jmp wq
.w4:
    cmp                  hd, 8
    jg .w4_h16
    WRAP_YMM %1 0                            ; h <= 8 fits in a ymm
    vextracti32x4       xm1, ym0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_ret                               ; flags still from `cmp hd, 8`: h < 8
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
    RET
.w4_h16:
    vpbroadcastd m7,
strided
    ; w4/h16: scatter 16 dwords, one 4-pixel row each, using
    ; stride-scaled offsets from bidir_sctr_w4.
    pmulld               m7, [bidir_sctr_w4]
    %1 0
    kxnorw               k1, k1, k1
    vpscatterdd [dstq+m7]{k1}, m0
    RET
.w8:
    cmp                  hd, 4
    jne .w8_h8
    WRAP_YMM %1 0                            ; h == 4: one ymm holds all rows
    vextracti32x4       xm1, ym0, 1
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    %1_INC_PTR 2
    lea                dstq, [dstq+strideq*4]
.w8_h8:
    ; 8 rows per iteration: qword lanes of m0 map to rows 0-3 (low
    ; halves) and 4-7 (high halves).
    %1 0
    vextracti32x4       xm1, ym0, 1
    vextracti32x4       xm2, m0, 2
    vextracti32x4       xm3, m0, 3
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    movhps [dstq          ], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub                  hd, 8
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR 2
    lea                dstq, [dstq+strideq*4]
.w16:
    %1 0
    vpermq               m0, m0, q3120       ; undo packuswb's lane interleave
    mova   [dstq          ], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq             m7, [pb_02461357]   ; qword order fixup after packuswb
.w32_loop:
    %1 0
    %1_INC_PTR 2
    vpermq               m0, m7, m0
    mova   [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq             m7, [pb_02461357]
.w64_loop:
    %1 0
    %1_INC_PTR 2
    vpermq               m0, m7, m0
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
.w128:
    pmovzxbq             m7, [pb_02461357]
.w128_loop:
    ; Two 64-byte halves per row.
    %1 0
    vpermq               m6, m7, m0
    %1 2
    mova        [dstq+64*0], m6
    %1_INC_PTR 4
    vpermq               m6, m7, m0
    mova        [dstq+64*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w128_loop
    RET
%endmacro

; AVG %1: plain average of two registers' worth of 16-bit intermediates
; from tmp1/tmp2, rounded back to pixel range with pw_1024 (m4) and
; packed to 8-bit in m0.
%macro AVG 1 ; src_offset
    mova                 m0, [tmp1q+(%1+0)*mmsize]
    paddw                m0, [tmp2q+(%1+0)*mmsize]
    mova m1,
[tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m4              ; rounded scale-back of (a + b)
    pmulhrsw             m1, m4
    packuswb             m0, m1
%endmacro

; Advance both intermediate-buffer pointers by %1 registers' worth.
%macro AVG_INC_PTR 1
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro

; avg(dst, stride, tmp1, tmp2, w, h): (tmp1 + tmp2) averaged to 8-bit.
cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                  r6, [avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4] ; width-indexed jump target
    vpbroadcastd         m4, [base+pw_1024]
    add                  wq, r6
    BIDIR_FN AVG

; W_AVG %1: weighted average; m4 holds the pre-shifted weight term and
; m5 the final rounding multiplier (pw_2048).
%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova                 m0, [tmp1q+(%1+0)*mmsize]
    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

; w_avg(dst, stride, tmp1, tmp2, w, h, weight)
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                  r6, [w_avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    ; weight <= 7: swap tmp1/tmp2 and negate the weight (last identity
    ; in the W_AVG derivation) so the signed multiplier stays usable.
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN W_AVG

; MASK %1: per-pixel blend of tmp1/tmp2 using the 8-bit mask stream at
; maskq (m4 = 0, m5 = pw_2048, m8 = qword permute for zmm loads).
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
%if mmsize == 64
    vpermq               m3, m8, [maskq+%1*32]
%else
    vpermq               m3, [maskq+%1*16], q3120
%endif
    mova                 m0, [tmp2q+(%1+0)*mmsize]
    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
    psubb                m3, m4, m3          ; m4 = 0, so m3 = -m
    paddw                m1, m1 ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    mova                 m1, [tmp2q+(%1+1)*mmsize]
    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
    paddw                m2, m2
    punpckhbw            m3, m4, m3
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw             m0, m5              ; final + 8 >> 4 rounding step
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

; Advance the mask pointer and both intermediate buffers.
%macro MASK_INC_PTR 1
    add               maskq, %1*32
    add               tmp2q, %1*64
    add               tmp1q, %1*64
%endmacro

; mask(dst, stride, tmp1, tmp2, w, h, mask)
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea                  r7, [mask_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    pxor                 m4, m4
    mova                 m8, [base+bilin_v_perm64]
    vpbroadcastd         m5, [base+pw_2048]
    add                  wq, r7
    BIDIR_FN MASK

; W_MASK dst, mask, tmp_offset[1-2], 4:4:4
; Blends two registers' worth of tmp1/tmp2 into packed pixels in m%1,
; deriving the per-pixel weight from |tmp1 - tmp2| (clamped against m6);
; the resulting "64 - m" values are left packed in m%2 for the caller's
; chroma-mask subsampling (inverted to "m" when %5 is set, for 4:4:4).
%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    mova                m%1, [tmp1q+mmsize*%3]
    mova                 m1, [tmp2q+mmsize*%3]
    psubw                m1, m%1
    pabsw               m%2, m1
    psubusw             m%2, m6, m%2         ; saturating: difference capped
    psrlw               m%2, 8 ; 64 - m
    psllw                m2, m%2, 10
    pmulhw               m1, m2
    paddw               m%1, m1
    mova                 m1, [tmp1q+mmsize*%4]
    mova                 m2, [tmp2q+mmsize*%4]
    psubw                m2, m1
    pabsw                m3, m2
    psubusw              m3, m6, m3
    vpshldw             m%2, m3, 8           ; merge both halves' mask bytes
    psllw                m3, m%2, 10
%if %5
    psubb               m%2, m5, m%2         ; 64 - (64 - m) = m
%endif
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw            m%1, m7
    pmulhrsw             m1, m7
    packuswb            m%1, m1
%endmacro

; w_mask_420(dst, stride, tmp1, tmp2, w, h, mask, sign)
cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea                  r7, [w_mask_420_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    vpbroadcastd         m9, [base+pb_m64] ; -1 << 6
    mova ym10,
[base+wm_420_mask+32] 4549 vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 4550 add wq, r7 4551 mov maskq, maskmp 4552 lea stride3q, [strideq*3] 4553 jmp wq 4554.w4: 4555 mova m5, [wm_420_perm4] 4556 cmp hd, 8 4557 jg .w4_h16 4558 WRAP_YMM W_MASK 0, 4, 0, 1 4559 vinserti128 ym5, [wm_420_perm4+32], 1 4560 vpermb ym4, ym5, ym4 4561 vpdpbusd ym8, ym4, ym9 4562 vextracti32x4 xm1, m0, 1 4563 movd [dstq+strideq*0], xm0 4564 pextrd [dstq+strideq*1], xm0, 1 4565 movd [dstq+strideq*2], xm1 4566 pextrd [dstq+stride3q ], xm1, 1 4567 jl .w4_end 4568 lea dstq, [dstq+strideq*4] 4569 pextrd [dstq+strideq*0], xm0, 2 4570 pextrd [dstq+strideq*1], xm0, 3 4571 pextrd [dstq+strideq*2], xm1, 2 4572 pextrd [dstq+stride3q ], xm1, 3 4573.w4_end: 4574 vpermb ym8, ym10, ym8 4575 movq [maskq], xm8 4576 RET 4577.w4_h16: 4578 vpbroadcastd m11, strided 4579 pmulld m11, [bidir_sctr_w4] 4580 W_MASK 0, 4, 0, 1 4581 vpermb m4, m5, m4 4582 vpdpbusd m8, m4, m9 4583 kxnorw k1, k1, k1 4584 vpermb m8, m10, m8 4585 mova [maskq], xm8 4586 vpscatterdd [dstq+m11]{k1}, m0 4587 RET 4588.w8: 4589 mova m5, [wm_420_perm8] 4590 cmp hd, 4 4591 jne .w8_h8 4592 WRAP_YMM W_MASK 0, 4, 0, 1 4593 vinserti128 ym5, [wm_420_perm8+32], 1 4594 vpermb ym4, ym5, ym4 4595 vpdpbusd ym8, ym4, ym9 4596 vpermb m8, m10, m8 4597 mova [maskq], xm8 4598 vextracti32x4 xm1, ym0, 1 4599 movq [dstq+strideq*0], xm0 4600 movq [dstq+strideq*1], xm1 4601 movhps [dstq+strideq*2], xm0 4602 movhps [dstq+stride3q ], xm1 4603 RET 4604.w8_loop: 4605 add tmp1q, 128 4606 add tmp2q, 128 4607 add maskq, 16 4608 lea dstq, [dstq+strideq*4] 4609.w8_h8: 4610 W_MASK 0, 4, 0, 1 4611 vpermb m4, m5, m4 4612 mova m1, m8 4613 vpdpbusd m1, m4, m9 4614 vpermb m1, m10, m1 4615 mova [maskq], xm1 4616 vextracti32x4 xm1, ym0, 1 4617 vextracti32x4 xm2, m0, 2 4618 vextracti32x4 xm3, m0, 3 4619 movq [dstq+strideq*0], xm0 4620 movq [dstq+strideq*1], xm1 4621 movq [dstq+strideq*2], xm2 4622 movq [dstq+stride3q ], xm3 4623 lea dstq, [dstq+strideq*4] 4624 movhps 
[dstq+strideq*0], xm0 4625 movhps [dstq+strideq*1], xm1 4626 movhps [dstq+strideq*2], xm2 4627 movhps [dstq+stride3q ], xm3 4628 sub hd, 8 4629 jg .w8_loop 4630 RET 4631.w16: 4632 mova m5, [wm_420_perm16] 4633.w16_loop: 4634 W_MASK 0, 4, 0, 1 4635 vpermb m4, m5, m4 4636 mova m1, m8 4637 vpdpbusd m1, m4, m9 4638 add tmp1q, 128 4639 add tmp2q, 128 4640 vpermb m1, m10, m1 4641 vpermq m0, m0, q3120 4642 mova [maskq], xm1 4643 add maskq, 16 4644 mova [dstq+strideq*0], xm0 4645 vextracti32x4 [dstq+strideq*1], m0, 2 4646 vextracti32x4 [dstq+strideq*2], ym0, 1 4647 vextracti32x4 [dstq+stride3q ], m0, 3 4648 lea dstq, [dstq+strideq*4] 4649 sub hd, 4 4650 jg .w16_loop 4651 RET 4652.w32: 4653 pmovzxbq m5, [pb_02461357] 4654.w32_loop: 4655 W_MASK 0, 4, 0, 1 4656 mova m1, m8 4657 vpdpbusd m1, m4, m9 4658 add tmp1q, 128 4659 add tmp2q, 128 4660 vpermb m1, m10, m1 4661 vpermq m0, m5, m0 4662 mova [maskq], xm1 4663 add maskq, 16 4664 mova [dstq+strideq*0], ym0 4665 vextracti32x8 [dstq+strideq*1], m0, 1 4666 lea dstq, [dstq+strideq*2] 4667 sub hd, 2 4668 jg .w32_loop 4669 RET 4670.w64: 4671 pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 4672 psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 4673.w64_loop: 4674 W_MASK 0, 4, 0, 2 4675 W_MASK 11, 5, 1, 3 4676 mova m2, m8 4677 vpdpbusd m2, m4, m9 4678 mova m3, m8 4679 vpdpbusd m3, m5, m9 4680 add tmp1q, 256 4681 add tmp2q, 256 4682 vpermt2b m2, m10, m3 4683 mova m1, m0 4684 vpermt2q m0, m12, m11 4685 vpermt2q m1, m13, m11 4686 mova [maskq], ym2 4687 add maskq, 32 4688 mova [dstq+strideq*0], m0 4689 mova [dstq+strideq*1], m1 4690 lea dstq, [dstq+strideq*2] 4691 sub hd, 2 4692 jg .w64_loop 4693 RET 4694.w128: 4695 pmovzxbq m14, [wm_420_perm64] 4696 mova m10, [wm_420_mask] 4697 psrlq m15, m14, 4 4698.w128_loop: 4699 W_MASK 0, 12, 0, 4 4700 W_MASK 11, 13, 1, 5 4701 mova m4, m8 4702 vpdpbusd m4, m12, m9 4703 mova m5, m8 4704 vpdpbusd m5, m13, m9 4705 mova m1, m0 4706 vpermt2q m0, m14, m11 4707 vpermt2q m1, m15, m11 4708 mova 
[dstq+strideq*0+64*0], m0 4709 mova [dstq+strideq*1+64*0], m1 4710 W_MASK 0, 12, 2, 6 4711 W_MASK 11, 13, 3, 7 4712 vprold m4, 16 4713 vprold m5, 16 4714 vpdpbusd m4, m12, m9 4715 vpdpbusd m5, m13, m9 4716 add tmp1q, 512 4717 add tmp2q, 512 4718 vpermt2b m4, m10, m5 4719 mova m1, m0 4720 vpermt2q m0, m14, m11 4721 vpermt2q m1, m15, m11 4722 mova [maskq], m4 4723 add maskq, 64 4724 mova [dstq+strideq*0+64*1], m0 4725 mova [dstq+strideq*1+64*1], m1 4726 lea dstq, [dstq+strideq*2] 4727 sub hd, 2 4728 jg .w128_loop 4729 RET 4730 4731cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 4732%define base r7-w_mask_422_avx512icl_table 4733 lea r7, [w_mask_422_avx512icl_table] 4734 tzcnt wd, wm 4735 mov r6d, r7m ; sign 4736 movifnidn hd, hm 4737 movsxd wq, dword [r7+wq*4] 4738 vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 4739 vpbroadcastd m7, [base+pw_2048] 4740 vpbroadcastd m9, [base+pw_m128] 4741 mova m10, [base+wm_422_mask] 4742 vpbroadcastd m11, [base+pb_127] 4743 add wq, r7 4744 vpbroadcastd m8, [base+wm_sign+4+r6*4] 4745 mov maskq, maskmp 4746 lea stride3q, [strideq*3] 4747 jmp wq 4748.w4: 4749 cmp hd, 8 4750 jg .w4_h16 4751 WRAP_YMM W_MASK 0, 4, 0, 1 4752 movhps xm10, [wm_422_mask+16] 4753 vpdpwssd ym8, ym4, ym9 4754 vpermb ym8, ym10, ym8 4755 vextracti32x4 xm1, m0, 1 4756 movd [dstq+strideq*0], xm0 4757 pextrd [dstq+strideq*1], xm0, 1 4758 movd [dstq+strideq*2], xm1 4759 pextrd [dstq+stride3q ], xm1, 1 4760 jl .w4_end 4761 lea dstq, [dstq+strideq*4] 4762 pextrd [dstq+strideq*0], xm0, 2 4763 pextrd [dstq+strideq*1], xm0, 3 4764 pextrd [dstq+strideq*2], xm1, 2 4765 pextrd [dstq+stride3q ], xm1, 3 4766.w4_end: 4767 pand xm8, xm11 4768 mova [maskq], xm8 4769 RET 4770.w4_h16: 4771 vpbroadcastd m5, strided 4772 pmulld m5, [bidir_sctr_w4] 4773 W_MASK 0, 4, 0, 1 4774 vpdpwssd m8, m4, m9 4775 kxnorw k1, k1, k1 4776 vpermb m8, m10, m8 4777 pand ym8, ym11 4778 mova [maskq], ym8 4779 vpscatterdd [dstq+m5]{k1}, m0 4780 RET 4781.w8: 4782 
cmp hd, 4 4783 jne .w8_h8 4784 WRAP_YMM W_MASK 0, 4, 0, 1 4785 movhps xm10, [wm_422_mask+16] 4786 vpdpwssd ym8, ym4, ym9 4787 vpermb ym8, ym10, ym8 4788 pand xm8, xm11 4789 mova [maskq], xm8 4790 vextracti32x4 xm1, ym0, 1 4791 movq [dstq+strideq*0], xm0 4792 movq [dstq+strideq*1], xm1 4793 movhps [dstq+strideq*2], xm0 4794 movhps [dstq+stride3q ], xm1 4795 RET 4796.w8_loop: 4797 add tmp1q, 128 4798 add tmp2q, 128 4799 add maskq, 32 4800 lea dstq, [dstq+strideq*4] 4801.w8_h8: 4802 W_MASK 0, 4, 0, 1 4803 mova m1, m8 4804 vpdpwssd m1, m4, m9 4805 vpermb m1, m10, m1 4806 pand ym1, ym11 4807 mova [maskq], ym1 4808 vextracti32x4 xm1, ym0, 1 4809 vextracti32x4 xm2, m0, 2 4810 vextracti32x4 xm3, m0, 3 4811 movq [dstq+strideq*0], xm0 4812 movq [dstq+strideq*1], xm1 4813 movq [dstq+strideq*2], xm2 4814 movq [dstq+stride3q ], xm3 4815 lea dstq, [dstq+strideq*4] 4816 movhps [dstq+strideq*0], xm0 4817 movhps [dstq+strideq*1], xm1 4818 movhps [dstq+strideq*2], xm2 4819 movhps [dstq+stride3q ], xm3 4820 sub hd, 8 4821 jg .w8_loop 4822 RET 4823.w16_loop: 4824 add tmp1q, 128 4825 add tmp2q, 128 4826 add maskq, 32 4827 lea dstq, [dstq+strideq*4] 4828.w16: 4829 W_MASK 0, 4, 0, 1 4830 mova m1, m8 4831 vpdpwssd m1, m4, m9 4832 vpermb m1, m10, m1 4833 vpermq m0, m0, q3120 4834 pand ym1, ym11 4835 mova [maskq], ym1 4836 mova [dstq+strideq*0], xm0 4837 vextracti32x4 [dstq+strideq*1], m0, 2 4838 vextracti32x4 [dstq+strideq*2], ym0, 1 4839 vextracti32x4 [dstq+stride3q ], m0, 3 4840 sub hd, 4 4841 jg .w16_loop 4842 RET 4843.w32: 4844 pmovzxbq m5, [pb_02461357] 4845.w32_loop: 4846 W_MASK 0, 4, 0, 1 4847 mova m1, m8 4848 vpdpwssd m1, m4, m9 4849 add tmp1q, 128 4850 add tmp2q, 128 4851 vpermb m1, m10, m1 4852 vpermq m0, m5, m0 4853 pand ym1, ym11 4854 mova [maskq], ym1 4855 add maskq, 32 4856 mova [dstq+strideq*0], ym0 4857 vextracti32x8 [dstq+strideq*1], m0, 1 4858 lea dstq, [dstq+strideq*2] 4859 sub hd, 2 4860 jg .w32_loop 4861 RET 4862.w64: 4863 pmovzxbq m5, [pb_02461357] 4864.w64_loop: 4865 
; w64/w128 tails of the preceding w_mask function, then w_mask_444_8bpc.
    W_MASK           0, 4, 0, 1
    mova             m1, m8
    vpdpwssd         m1, m4, m9
    add              tmp1q, 128
    add              tmp2q, 128
    vpermb           m1, m10, m1
    vpermq           m0, m5, m0
    pand             ym1, ym11
    mova             [maskq], ym1
    add              maskq, 32
    mova             [dstq], m0               ; one full 64-pixel row per iteration
    add              dstq, strideq
    dec              hd
    jg               .w64_loop
    RET
.w128:
    pmovzxbq         m13, [pb_02461357]
.w128_loop:
    W_MASK           0, 4, 0, 1                ; left 64 pixels
    W_MASK           12, 5, 2, 3               ; right 64 pixels
    mova             m2, m8
    vpdpwssd         m2, m4, m9
    mova             m3, m8
    vpdpwssd         m3, m5, m9
    add              tmp1q, 256
    add              tmp2q, 256
    vpermt2b         m2, m10, m3               ; merge both mask halves
    vpermq           m0, m13, m0
    vpermq           m1, m13, m12
    pand             m2, m11
    mova             [maskq], m2
    add              maskq, 64
    mova             [dstq+64*0], m0
    mova             [dstq+64*1], m1
    add              dstq, strideq
    dec              hd
    jg               .w128_loop
    RET

;-----------------------------------------------------------------------
; w_mask_444_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
; Weighted blend of two intermediate buffers (tmp1/tmp2) into 8-bit dst,
; also writing the blend mask at full (4:4:4) resolution to maskq.
; Dispatches through a per-width jump table indexed by tzcnt(w).
; The W_MASK macro (defined earlier in the file) produces the blended
; pixels in m0(/m12) and the raw mask values in m4(/m9/m5).
;-----------------------------------------------------------------------
cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea              r7, [w_mask_444_avx512icl_table]
    tzcnt            wd, wm
    movifnidn        hd, hm
    movsxd           wq, dword [r7+wq*4]
    vpbroadcastd     m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd     m5, [base+pb_64]
    vpbroadcastd     m7, [base+pw_2048]
    mova             m8, [base+wm_444_mask]
    add              wq, r7
    mov              maskq, maskmp
    lea              stride3q, [strideq*3]
    jmp              wq
.w4:
    cmp              hd, 8
    jg               .w4_h16
    WRAP_YMM W_MASK  0, 4, 0, 1, 1
    vinserti128      ym8, [wm_444_mask+32], 1
    vpermb           ym4, ym8, ym4
    mova             [maskq], ym4
    vextracti32x4    xm1, m0, 1
    movd             [dstq+strideq*0], xm0
    pextrd           [dstq+strideq*1], xm0, 1
    movd             [dstq+strideq*2], xm1
    pextrd           [dstq+stride3q ], xm1, 1
    ; flags are still live from the "cmp hd, 8" above (vector ops and
    ; stores don't touch RFLAGS): jl means h < 8, i.e. only 4 rows
    jl               .w4_end
    lea              dstq, [dstq+strideq*4]
    pextrd           [dstq+strideq*0], xm0, 2
    pextrd           [dstq+strideq*1], xm0, 3
    pextrd           [dstq+strideq*2], xm1, 2
    pextrd           [dstq+stride3q ], xm1, 3
.w4_end:
    RET
.w4_h16:
    vpbroadcastd     m9, strided
    pmulld           m9, [bidir_sctr_w4]       ; per-dword scatter offsets
    W_MASK           0, 4, 0, 1, 1
    vpermb           m4, m8, m4
    kxnorw           k1, k1, k1                ; k1 = all-ones scatter write mask
    mova             [maskq], m4
; remaining width paths of w_mask_444_8bpc, then blend_8bpc and the
; start of blend_v_8bpc.
    vpscatterdd      [dstq+m9]{k1}, m0         ; 16 dwords = 16 rows of 4 pixels
    RET
.w8:
    cmp              hd, 4
    jne              .w8_h8
    WRAP_YMM W_MASK  0, 4, 0, 1, 1
    vinserti128      ym8, [wm_444_mask+32], 1
    vpermb           ym4, ym8, ym4
    mova             [maskq], ym4
    vextracti32x4    xm1, ym0, 1
    movq             [dstq+strideq*0], xm0
    movq             [dstq+strideq*1], xm1
    movhps           [dstq+strideq*2], xm0
    movhps           [dstq+stride3q ], xm1
    RET
.w8_loop:
    add              tmp1q, 128
    add              tmp2q, 128
    add              maskq, 64
    lea              dstq, [dstq+strideq*4]
.w8_h8:
    W_MASK           0, 4, 0, 1, 1
    vpermb           m4, m8, m4
    mova             [maskq], m4
    vextracti32x4    xm1, ym0, 1
    vextracti32x4    xm2, m0, 2
    vextracti32x4    xm3, m0, 3
    movq             [dstq+strideq*0], xm0
    movq             [dstq+strideq*1], xm1
    movq             [dstq+strideq*2], xm2
    movq             [dstq+stride3q ], xm3
    lea              dstq, [dstq+strideq*4]
    movhps           [dstq+strideq*0], xm0
    movhps           [dstq+strideq*1], xm1
    movhps           [dstq+strideq*2], xm2
    movhps           [dstq+stride3q ], xm3
    sub              hd, 8
    jg               .w8_loop
    RET
.w16_loop:
    add              tmp1q, 128
    add              tmp2q, 128
    add              maskq, 64
    lea              dstq, [dstq+strideq*4]
.w16:
    W_MASK           0, 4, 0, 1, 1
    vpermb           m4, m8, m4
    vpermq           m0, m0, q3120             ; reorder 128-bit lanes for stores
    mova             [maskq], m4
    mova             [dstq+strideq*0], xm0
    vextracti32x4    [dstq+strideq*1], m0, 2
    vextracti32x4    [dstq+strideq*2], ym0, 1
    vextracti32x4    [dstq+stride3q ], m0, 3
    sub              hd, 4
    jg               .w16_loop
    RET
.w32:
    pmovzxbq         m9, [pb_02461357]
.w32_loop:
    W_MASK           0, 4, 0, 1, 1
    vpermb           m4, m8, m4
    add              tmp1q, 128
    add              tmp2q, 128
    vpermq           m0, m9, m0
    mova             [maskq], m4
    add              maskq, 64
    mova             [dstq+strideq*0], ym0
    vextracti32x8    [dstq+strideq*1], m0, 1
    lea              dstq, [dstq+strideq*2]
    sub              hd, 2
    jg               .w32_loop
    RET
.w64:
    pmovzxbq         m9, [pb_02461357]
.w64_loop:
    W_MASK           0, 4, 0, 1, 1
    vpermb           m4, m8, m4
    add              tmp1q, 128
    add              tmp2q, 128
    vpermq           m0, m9, m0
    mova             [maskq], m4
    add              maskq, 64
    mova             [dstq], m0
    add              dstq, strideq
    dec              hd
    jg               .w64_loop
    RET
.w128:
    pmovzxbq         m11, [pb_02461357]
.w128_loop:
    W_MASK           0, 4, 0, 1, 1             ; left 64 pixels
    W_MASK           10, 9, 2, 3, 1            ; right 64 pixels
    vpermb           m4, m8, m4
    vpermb           m9, m8, m9
    add              tmp1q, 256
    add              tmp2q, 256
    vpermq           m0, m11, m0
    vpermq           m10, m11, m10
    mova             [maskq+64*0], m4
    mova             [maskq+64*1], m9
    add              maskq, 128
    mova             [dstq+64*0], m0
    mova             [dstq+64*1], m10
    add              dstq, strideq
    dec              hd
    jg               .w128_loop
    RET

;-----------------------------------------------------------------------
; blend_8bpc(dst, ds, tmp, w, h, mask)
; Per-pixel blend: dst = (dst*(64-mask) + tmp*mask) with a rounded >> 6,
; implemented via pmaddubsw on interleaved pixel/weight bytes and
; pmulhrsw with pw_512.
;-----------------------------------------------------------------------
cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea              r6, [blend_avx512icl_table]
    tzcnt            wd, wm
    movifnidn        maskq, maskmp
    movifnidn        hd, hm
    movsxd           wq, [r6+wq*4]
    vpbroadcastd     m6, [base+pb_64]
    vpbroadcastd     m7, [base+pw_512]
    sub              tmpq, maskq               ; tmp addressed as [maskq+tmpq]
                                               ; so only maskq is advanced
    add              wq, r6
    lea              r6, [dsq*3]               ; r6 = stride * 3
    jmp              wq
.w4:
    movd             xmm0, [dstq+dsq*0]
    pinsrd           xmm0, [dstq+dsq*1], 1
    vpbroadcastd     xmm1, [dstq+dsq*2]
    pinsrd           xmm1, [dstq+r6 ], 3
    mova             xmm4, [maskq]
    mova             xmm5, [maskq+tmpq]
    add              maskq, 4*4
    psubb            xmm3, xm6, xmm4           ; 64 - mask
    punpcklbw        xmm0, xmm5                ; interleave dst/tmp bytes
    punpcklbw        xmm2, xmm3, xmm4          ; interleave (64-m)/m weights
    punpckhbw        xmm1, xmm5
    punpckhbw        xmm3, xmm4
    pmaddubsw        xmm0, xmm2                ; dst*(64-m) + tmp*m
    pmaddubsw        xmm1, xmm3
    pmulhrsw         xmm0, xm7                 ; rounded >> 6
    pmulhrsw         xmm1, xm7
    packuswb         xmm0, xmm1
    movd             [dstq+dsq*0], xmm0
    pextrd           [dstq+dsq*1], xmm0, 1
    pextrd           [dstq+dsq*2], xmm0, 2
    pextrd           [dstq+r6 ], xmm0, 3
    lea              dstq, [dstq+dsq*4]
    sub              hd, 4
    jg               .w4
    RET
.w8:
    movq             xmm0, [dstq+dsq*0]
    vpbroadcastq     xmm1, [dstq+dsq*1]
    vpbroadcastq     ymm2, [dstq+dsq*2]
    vpbroadcastq     ymm3, [dstq+r6 ]
    mova             ymm4, [maskq]
    mova             ymm5, [maskq+tmpq]
    add              maskq, 8*4
    vpblendd         ymm0, ymm2, 0x30          ; rows 0+2 in ymm0
    vpblendd         ymm1, ymm3, 0xc0          ; rows 1+3 in ymm1
    psubb            ymm3, ym6, ymm4           ; 64 - mask
    punpcklbw        ymm0, ymm5
    punpcklbw        ymm2, ymm3, ymm4
    punpckhbw        ymm1, ymm5
    punpckhbw        ymm3, ymm4
    pmaddubsw        ymm0, ymm2
    pmaddubsw        ymm1, ymm3
    pmulhrsw         ymm0, ym7
    pmulhrsw         ymm1, ym7
    packuswb         ymm0, ymm1
    vextracti128     xmm1, ymm0, 1
    movq             [dstq+dsq*0], xmm0
    movhps           [dstq+dsq*1], xmm0
    movq             [dstq+dsq*2], xmm1
    movhps           [dstq+r6 ], xmm1
    lea              dstq, [dstq+dsq*4]
    sub              hd, 4
    jg               .w8
    vzeroupper                                  ; leaving legacy-ymm code path
    RET
.w16:
    mova             xm1, [dstq+dsq*0]          ; gather 4 rows into one zmm
    vinserti32x4     ym1, [dstq+dsq*1], 1
    vinserti32x4     m1, [dstq+dsq*2], 2
    mova             m4, [maskq]
    vinserti32x4     m1, [dstq+r6 ], 3
    mova             m5, [maskq+tmpq]
    add              maskq, 16*4
    psubb            m3, m6, m4                 ; 64 - mask
    punpcklbw        m0, m1, m5
    punpcklbw        m2, m3, m4
    punpckhbw        m1, m5
    punpckhbw        m3, m4
    pmaddubsw        m0, m2
    pmaddubsw        m1, m3
    pmulhrsw         m0, m7
    pmulhrsw         m1, m7
    packuswb         m0, m1
    mova             [dstq+dsq*0], xm0
    vextracti32x4    [dstq+dsq*1], ym0, 1
    vextracti32x4    [dstq+dsq*2], m0, 2
    vextracti32x4    [dstq+r6 ], m0, 3
    lea              dstq, [dstq+dsq*4]
    sub              hd, 4
    jg               .w16
    RET
.w32:
    mova             ym1, [dstq+dsq*0]          ; 2 rows of 32 per iteration
    vinserti32x8     m1, [dstq+dsq*1], 1
    mova             m4, [maskq]
    mova             m5, [maskq+tmpq]
    add              maskq, 32*2
    psubb            m3, m6, m4                 ; 64 - mask
    punpcklbw        m0, m1, m5
    punpcklbw        m2, m3, m4
    punpckhbw        m1, m5
    punpckhbw        m3, m4
    pmaddubsw        m0, m2
    pmaddubsw        m1, m3
    pmulhrsw         m0, m7
    pmulhrsw         m1, m7
    packuswb         m0, m1
    mova             [dstq+dsq*0], ym0
    vextracti32x8    [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w32
    RET

;-----------------------------------------------------------------------
; blend_v_8bpc(dst, ds, tmp, w, h, mask)
; Vertical-edge blend: per-column weights come from the constant
; obmc_masks table (maskq is rebased onto it below) and are identical
; for every row, so they are loaded once per width path.
;-----------------------------------------------------------------------
cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_avx512icl_table
    lea              r5, [blend_v_avx512icl_table]
    tzcnt            wd, wm
    movifnidn        hd, hm
    movsxd           wq, [r5+wq*4]
    vpbroadcastd     m5, [base+pw_512]
    add              wq, r5
    add              maskq, obmc_masks-blend_v_avx512icl_table
    jmp              wq
.w2:
    vpbroadcastd     xmm2, [maskq+2*2]          ; constant column weights
.w2_s0_loop:
    movd             xmm0, [dstq+dsq*0]
    pinsrw           xmm0, [dstq+dsq*1], 1
    movd             xmm1, [tmpq]
    add              tmpq, 2*2
; interior of blend_v_8bpc (entry, jump table and the .w2 head are above);
; each path interleaves dst/tmp bytes, weights with pmaddubsw, and does a
; rounded >> 6 via pmulhrsw with pw_512 (xm5/ym5/m5).
    punpcklbw        xmm0, xmm1
    pmaddubsw        xmm0, xmm2
    pmulhrsw         xmm0, xm5
    packuswb         xmm0, xmm0
    pextrw           [dstq+dsq*0], xmm0, 0
    pextrw           [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w2_s0_loop
    RET
.w4:
    vpbroadcastq     xmm2, [maskq+4*2]          ; constant column weights
.w4_loop:
    movd             xmm0, [dstq+dsq*0]
    pinsrd           xmm0, [dstq+dsq*1], 1
    movq             xmm1, [tmpq]
    add              tmpq, 4*2
    punpcklbw        xmm0, xmm1
    pmaddubsw        xmm0, xmm2
    pmulhrsw         xmm0, xm5
    packuswb         xmm0, xmm0
    movd             [dstq+dsq*0], xmm0
    pextrd           [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w4_loop
    RET
.w8:
    mova             xmm3, [maskq+8*2]
.w8_loop:
    movq             xmm0, [dstq+dsq*0]
    vpbroadcastq     xmm1, [dstq+dsq*1]
    mova             xmm2, [tmpq]
    add              tmpq, 8*2
    punpcklbw        xmm0, xmm2
    punpckhbw        xmm1, xmm2
    pmaddubsw        xmm0, xmm3
    pmaddubsw        xmm1, xmm3
    pmulhrsw         xmm0, xm5
    pmulhrsw         xmm1, xm5
    packuswb         xmm0, xmm1
    movq             [dstq+dsq*0], xmm0
    movhps           [dstq+dsq*1], xmm0
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w8_loop
    RET
.w16:
    vbroadcasti32x4  ym3, [maskq+16*2]          ; weights for low/high byte pairs
    vbroadcasti32x4  ym4, [maskq+16*3]
.w16_loop:
    mova             xm1, [dstq+dsq*0]
    vinserti32x4     ym1, [dstq+dsq*1], 1
    mova             ym2, [tmpq]
    add              tmpq, 16*2
    punpcklbw        ym0, ym1, ym2
    punpckhbw        ym1, ym2
    pmaddubsw        ym0, ym3
    pmaddubsw        ym1, ym4
    pmulhrsw         ym0, ym5
    pmulhrsw         ym1, ym5
    packuswb         ym0, ym1
    mova             [dstq+dsq*0], xm0
    vextracti32x4    [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w16_loop
    RET
.w32:
    mova             m4, [maskq+32*2]
    vshufi32x4       m3, m4, m4, q2020          ; split weights to match punpck
    vshufi32x4       m4, m4, q3131              ; low/high interleave order
.w32_loop:
    mova             ym1, [dstq+dsq*0]
    vinserti32x8     m1, [dstq+dsq*1], 1
    mova             m2, [tmpq]
    add              tmpq, 32*2
    punpcklbw        m0, m1, m2
    punpckhbw        m1, m2
    pmaddubsw        m0, m3
    pmaddubsw        m1, m4
    pmulhrsw         m0, m5
    pmulhrsw         m1, m5
    packuswb         m0, m1
; store tail of blend_v_8bpc .w32, then blend_h_8bpc.
    mova             [dstq+dsq*0], ym0
    vextracti32x8    [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    sub              hd, 2
    jg               .w32_loop
    RET

;-----------------------------------------------------------------------
; blend_h_8bpc(dst, ds, tmp, w, h, mask)
; Horizontal-edge blend: one weight per row, taken from the obmc_masks
; table. Only the first 3/4 of the rows are processed (h is scaled by
; 3/4 below); hq counts up from -(3h/4) to 0 and doubles as the
; per-row index into the mask table.
;-----------------------------------------------------------------------
cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_avx512icl_table
    lea              r6, [blend_h_avx512icl_table]
    tzcnt            wd, wm
    mov              hd, hm
    movsxd           wq, [r6+wq*4]
    lea              maskq, [base+obmc_masks+hq*2]
    vpbroadcastd     m5, [base+pw_512]
    lea              hd, [hq*3]
    add              wq, r6
    shr              hd, 2 ; h * 3/4
    lea              maskq, [maskq+hq*2]        ; end of the mask run
    neg              hq                         ; index rows as negative offsets
    jmp              wq
.w2:
    movd             xmm0, [dstq+dsq*0]
    pinsrw           xmm0, [dstq+dsq*1], 1
    movd             xmm2, [maskq+hq*2]         ; two row weights
    movd             xmm1, [tmpq]
    add              tmpq, 2*2
    punpcklwd        xmm2, xmm2
    punpcklbw        xmm0, xmm1
    pmaddubsw        xmm0, xmm2
    pmulhrsw         xmm0, xm5                  ; rounded >> 6
    packuswb         xmm0, xmm0
    pextrw           [dstq+dsq*0], xmm0, 0
    pextrw           [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    add              hq, 2
    jl               .w2
    RET
.w4:
    mova             xmm3, [blend_shuf]
.w4_loop:
    movd             xmm0, [dstq+dsq*0]
    pinsrd           xmm0, [dstq+dsq*1], 1
    movd             xmm2, [maskq+hq*2]
    movq             xmm1, [tmpq]
    add              tmpq, 4*2
    pshufb           xmm2, xmm3                 ; splat row weights per pixel
    punpcklbw        xmm0, xmm1
    pmaddubsw        xmm0, xmm2
    pmulhrsw         xmm0, xm5
    packuswb         xmm0, xmm0
    movd             [dstq+dsq*0], xmm0
    pextrd           [dstq+dsq*1], xmm0, 1
    lea              dstq, [dstq+dsq*2]
    add              hq, 2
    jl               .w4_loop
    RET
.w8:
    vbroadcasti128   ymm4, [blend_shuf]
    shufpd           ymm4, ymm4, 0x03           ; row 0 / row 1 weight shuffle
.w8_loop:
    vpbroadcastq     ymm1, [dstq+dsq*0]
    movq             xmm0, [dstq+dsq*1]
    vpblendd         ymm0, ymm1, 0x30           ; row1 in low half, row0 in high
    vpbroadcastd     ymm3, [maskq+hq*2]
    movq             xmm1, [tmpq+8*1]
    vinserti128      ymm1, [tmpq+8*0], 1        ; tmp rows in matching order
    add              tmpq, 8*2
    pshufb           ymm3, ymm4
    punpcklbw        ymm0, ymm1
    pmaddubsw        ymm0, ymm3
    pmulhrsw         ymm0, ym5
    vextracti128     xmm1, ymm0, 1
    packuswb         xmm0, xmm1
    movhps           [dstq+dsq*0], xmm0         ; halves swapped to undo blend order
    movq             [dstq+dsq*1], xmm0
    lea              dstq, [dstq+dsq*2]
    add              hq, 2
    jl               .w8_loop
    vzeroupper                                  ; leaving legacy-ymm code path
    RET
.w16:
; blend_h_8bpc continued (.w16 and wider), then resize_8bpc.
    vbroadcasti32x4  ym4, [blend_shuf]
    shufpd           ym4, ym4, 0x0c             ; per-row weight shuffle
.w16_loop:
    mova             xm1, [dstq+dsq*0]
    vinserti32x4     ym1, [dstq+dsq*1], 1
    vpbroadcastd     ym3, [maskq+hq*2]
    mova             ym2, [tmpq]
    add              tmpq, 16*2
    pshufb           ym3, ym4
    punpcklbw        ym0, ym1, ym2
    punpckhbw        ym1, ym2
    pmaddubsw        ym0, ym3
    pmaddubsw        ym1, ym3
    pmulhrsw         ym0, ym5                   ; rounded >> 6
    pmulhrsw         ym1, ym5
    packuswb         ym0, ym1
    mova             [dstq+dsq*0], xm0
    vextracti32x4    [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    add              hq, 2
    jl               .w16_loop
    RET
.w32:
    vbroadcasti32x4  m4, [blend_shuf]
    shufpd           m4, m4, 0xf0
.w32_loop:
    mova             ym1, [dstq+dsq*0]
    vinserti32x8     m1, [dstq+dsq*1], 1
    vpbroadcastd     m3, [maskq+hq*2]
    mova             m2, [tmpq]
    add              tmpq, 32*2
    pshufb           m3, m4
    punpcklbw        m0, m1, m2
    punpckhbw        m1, m2
    pmaddubsw        m0, m3
    pmaddubsw        m1, m3
    pmulhrsw         m0, m5
    pmulhrsw         m1, m5
    packuswb         m0, m1
    mova             [dstq+dsq*0], ym0
    vextracti32x8    [dstq+dsq*1], m0, 1
    lea              dstq, [dstq+dsq*2]
    add              hq, 2
    jl               .w32_loop
    RET
.w64:
    vpbroadcastw     m3, [maskq+hq*2]           ; one weight for the whole row
    mova             m1, [dstq]
    mova             m2, [tmpq]
    add              tmpq, 32*2
    punpcklbw        m0, m1, m2
    punpckhbw        m1, m2
    pmaddubsw        m0, m3
    pmaddubsw        m1, m3
    pmulhrsw         m0, m5
    pmulhrsw         m1, m5
    packuswb         m0, m1
    mova             [dstq], m0
    add              dstq, dsq
    inc              hq
    jl               .w64
    RET
.w128:
    vpbroadcastw     m6, [maskq+hq*2]
    mova             m2, [dstq+64*0]
    mova             m1, [tmpq+64*0]
    mova             m3, [dstq+64*1]
    mova             m4, [tmpq+64*1]
    add              tmpq, 64*2
    punpcklbw        m0, m2, m1
    punpckhbw        m2, m1
    pmaddubsw        m0, m6
    pmaddubsw        m2, m6
    punpcklbw        m1, m3, m4
    punpckhbw        m3, m4
    pmaddubsw        m1, m6
    pmaddubsw        m3, m6
    REPX {pmulhrsw x, m5}, m0, m2, m1, m3
    packuswb         m0, m2
    packuswb         m1, m3
    mova             [dstq+64*0], m0
    mova             [dstq+64*1], m1
    add              dstq, dsq
    inc              hq
    jl               .w128
    RET

;-----------------------------------------------------------------------
; resize_8bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0)
; Horizontal scaler: for each of 16 output pixels per iteration, mx is a
; 14.? fixed-point source position advanced by dx per pixel. The integer
; part (mx >> 14) selects an 8-tap source window (clamped to
; [0, src_w-8]); bits of mx select one of 64 filters from resize_filter.
; Edge pixels are remapped through resize_shuf when clamping occurred.
;-----------------------------------------------------------------------
cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub              dword mx0m, 4<<14
    sub              dword src_wm, 8
    mov              r6, ~0
    vpbroadcastd     m5, dxm
    vpbroadcastd     m8, mx0m
    vpbroadcastd     m6, src_wm
    kmovq            k3, r6                     ; k3 = all-ones gather mask template
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA              r7, $$
%define base r7-$$
    vpbroadcastd     m3, [base+pw_m256]
    vpbroadcastd     m7, [base+pd_63]
    vbroadcasti32x4  m15, [base+pb_8x0_8x8]
    vpdpwssd         m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld            m5, 4                      ; dx*16
    pslld            m6, 14
    pxor             m2, m2
    mova             m16, [base+resize_permA]
    mova             m17, [base+resize_permB]
    mova             xm18, [base+resize_permC]
.loop_y:
    xor              xd, xd
    mova             m4, m8                     ; per-line working version of mx
.loop_x:
    pmaxsd           m0, m4, m2
    psrad            m9, m4, 8                  ; filter offset (unmasked)
    pminsd           m0, m6                     ; iclip(mx, 0, src_w-8)
    psubd            m1, m4, m0                 ; pshufb offset
    psrad            m0, 14                     ; clipped src_x offset
    psrad            m1, 14                     ; pshufb edge_emu offset
    vptestmd         k4, m1, m1
    pand             m9, m7                     ; filter offset (masked)
    ktestw           k4, k4
    jz               .load                      ; no clamping -> fast gather path
    ; edge path: gather qword windows, then shuffle in the replicated
    ; border samples via resize_shuf before repacking with permA/permB
    vextracti32x8    ym12, m0, 1
    vextracti32x8    ym13, m1, 1
    kmovq            k1, k3                     ; gathers consume their mask,
    kmovq            k2, k3                     ; so reload it every time
    vpgatherdq       m10{k1}, [srcq+ym0]
    vpgatherdq       m11{k2}, [srcq+ym12]
    kmovq            k1, k3
    kmovq            k2, k3
    vpgatherdq       m14{k1}, [base+resize_shuf+4+ym1]
    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13]
    mova             m12, m16
    mova             m13, m17
    paddb            m14, m15
    paddb            m0, m15
    pshufb           m10, m14
    pshufb           m11, m0
    vpermi2d         m12, m10, m11
    vpermi2d         m13, m10, m11
    jmp              .filter
.load:
    kmovq            k1, k3
    kmovq            k2, k3
    vpgatherdd       m12{k1}, [srcq+m0+0]
    vpgatherdd       m13{k2}, [srcq+m0+4]
.filter:
    kmovq            k1, k3
    kmovq            k2, k3
    vpgatherdd       m10{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd       m11{k2}, [base+resize_filter+m9*8+4]
    mova             m14, m2                    ; zero the accumulator
    vpdpbusd         m14, m12, m10              ; 8-tap dot product per pixel
    vpdpbusd         m14, m13, m11
    packssdw         m14, m14
; epilogue of the resize_8bpc x/y loops (body is above).
    pmulhrsw         m14, m3                    ; scale by pw_m256 with rounding
    packuswb         m14, m14
    vpermd           m14, m18, m14
    mova             [dstq+xq], xm14            ; 16 output pixels
    paddd            m4, m5                     ; mx += dx*16
    add              xd, 16
    cmp              xd, dst_wd
    jl               .loop_x
    add              dstq, dst_strideq
    add              srcq, src_strideq
    dec              hd
    jg               .loop_y
    RET

%endif ; ARCH_X86_64