1; Copyright © 2020, VideoLAN and dav1d authors 2; Copyright © 2020, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 64 32 33%macro SMOOTH_WEIGHT_TABLE 1-* 34 %rep %0 35 db %1-128, 127-%1 36 %rotate 1 37 %endrep 38%endmacro 39 40smooth_weights: SMOOTH_WEIGHT_TABLE \ 41 0, 0, 255, 128, 255, 149, 85, 64, \ 42 255, 197, 146, 105, 73, 50, 37, 32, \ 43 255, 225, 196, 170, 145, 123, 102, 84, \ 44 68, 54, 43, 33, 26, 20, 17, 16, \ 45 255, 240, 225, 210, 196, 182, 169, 157, \ 46 145, 133, 122, 111, 101, 92, 83, 74, \ 47 66, 59, 52, 45, 39, 34, 29, 25, \ 48 21, 17, 14, 12, 10, 9, 8, 8, \ 49 255, 248, 240, 233, 225, 218, 210, 203, \ 50 196, 189, 182, 176, 169, 163, 156, 150, \ 51 144, 138, 133, 127, 121, 116, 111, 106, \ 52 101, 96, 91, 86, 82, 77, 73, 69, \ 53 65, 61, 57, 54, 50, 47, 44, 41, \ 54 38, 35, 32, 29, 27, 25, 22, 20, \ 55 18, 16, 15, 13, 12, 10, 9, 8, \ 56 7, 6, 6, 5, 5, 4, 4, 4 57 58; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ 59filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 60 db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 61 db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 62 db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 63 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 64 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 65 db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 66 db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 67 db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 68 db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 69 db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 70 db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 71 db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 72 db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 73 db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 74 db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 75 db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 76 db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 77 db 0, 
14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 78 db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 79filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 80 db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 81 db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 82 db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 83filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 84smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 85 db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 86 db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 87 db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 88smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 89 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 90 db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 91 db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 92smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 93 db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 94 db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 95 db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 96ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 97 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 98pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 99pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 100pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 101 db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32 102 db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 103 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 104z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 105 db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 106 db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 
22, 42, 20, 44, 18, 46 107 db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62 108z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6 109 db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22 110 db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38 111 db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54 112z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16 113 db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32 114 db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48 115 db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64 116z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 117z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 118z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8 119z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9 120z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 121 db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 122z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72 123z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80 124z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 125 db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 126 db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 127 db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56 128z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 129 db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 130 db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 131 db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 132z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 133 dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 134z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 
66, 66, 67, 66, 67 135 db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71 136 db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75 137 db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79 138z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 139 db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1 140 db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 141 db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 142z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1 143 db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3 144 db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5 145 db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7 146z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24 147 dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56 148z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32 149 dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64 150z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 151 dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 152 dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512 153 dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 154z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 155 dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 156 dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 157 dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 158z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 159z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 160z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 161 db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8 162z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 163 db 39, 39, 
47, 47, 47, 79, 79, 79 164z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 165 db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0 166 db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16 167 168pb_8_56_0_0: db 8, 56, 0, 0 169pb_m4_36: times 2 db -4, 36 170pb_127_m127: times 2 db 127, -127 171pb_8: times 4 db 8 172pb_15: times 4 db 15 173pb_16: times 4 db 16 174pb_31: times 4 db 31 175pb_63: times 4 db 63 176pb_90: times 4 db 90 177pb_128: times 4 db 128 178pw_128: times 2 dw 128 179pw_255: times 2 dw 255 180pw_512: times 2 dw 512 181 182%define pb_1 (ipred_h_shuf+24) 183%define pb_2 (ipred_h_shuf+20) 184%define pb_3 (ipred_h_shuf+16) 185%define pb_4 (smooth_shuf +48) 186%define pb_7 (ipred_h_shuf+ 0) 187%define pb_9 (z_xpos_bc + 8) 188%define pb_17 (z_xpos_bc + 0) 189%define pb_33 (z_xpos_bc + 4) 190%define pd_8 (filter_taps+128) 191 192%macro JMP_TABLE 3-* 193 %xdefine %1_%2_table (%%table - 2*4) 194 %xdefine %%base mangle(private_prefix %+ _%1_%2) 195 %%table: 196 %rep %0 - 2 197 dd %%base %+ .%3 - (%%table - 2*4) 198 %rotate 1 199 %endrep 200%endmacro 201 202%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) 203 204JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 205JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 206JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 207JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 208JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 209JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 210JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64 211JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 212JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ 213 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 214JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 215 216cextern dr_intra_derivative 217cextern pb_0to63 218 219SECTION .text 220 221INIT_ZMM avx512icl 222cglobal ipred_dc_top_8bpc, 
3, 7, 5, dst, stride, tl, w, h 223 lea r5, [ipred_dc_left_8bpc_avx512icl_table] 224 movd xm0, wm 225 tzcnt wd, wm 226 inc tlq 227 movifnidn hd, hm 228 movu ym1, [tlq] 229 movd xmm3, wd 230 movsxd r6, [r5+wq*4] 231 vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] 232 psrld xm0, 1 233 vpdpbusd ym0, ym1, ym2 234 add r6, r5 235 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table 236 movsxd wq, [r5+wq*4] 237 add wq, r5 238 jmp r6 239 240cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 241 lea r5, [ipred_dc_left_8bpc_avx512icl_table] 242 mov hd, hm 243 tzcnt r6d, hd 244 sub tlq, hq 245 tzcnt wd, wm 246 movd xm0, hm 247 movu ym1, [tlq] 248 movd xmm3, r6d 249 movsxd r6, [r5+r6*4] 250 vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] 251 psrld xm0, 1 252 vpdpbusd ym0, ym1, ym2 253 add r6, r5 254 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table 255 movsxd wq, [r5+wq*4] 256 add wq, r5 257 jmp r6 258.h64: 259 movu ym1, [tlq+32] ; unaligned when jumping here from dc_top 260 vpdpbusd ym0, ym1, ym2 261.h32: 262 vextracti32x4 xm1, ym0, 1 263 paddd xm0, xm1 264.h16: 265 punpckhqdq xm1, xm0, xm0 266 paddd xm0, xm1 267.h8: 268 psrlq xm1, xm0, 32 269 paddd xm0, xm1 270.h4: 271 vpsrlvd xm0, xmm3 272 lea stride3q, [strideq*3] 273 vpbroadcastb m0, xm0 274 jmp wq 275 276cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 277 movifnidn hd, hm 278 movifnidn wd, wm 279 tzcnt r6d, hd 280 lea r5d, [wq+hq] 281 movd xm0, r5d 282 tzcnt r5d, r5d 283 movd xmm4, r5d 284 lea r5, [ipred_dc_8bpc_avx512icl_table] 285 tzcnt wd, wd 286 movsxd r6, [r5+r6*4] 287 movsxd wq, [r5+wq*4+5*4] 288 vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] 289 psrld xm0, 1 290 add r6, r5 291 add wq, r5 292 lea stride3q, [strideq*3] 293 jmp r6 294.h4: 295 movd xmm1, [tlq-4] 296 vpdpbusd xm0, xmm1, xm3 297 jmp wq 298.w4: 299 movd xmm1, [tlq+1] 300 vpdpbusd xm0, xmm1, xm3 301 cmp hd, 4 302 jg .w4_mul 303 
psrlw xmm0, xm0, 3 304 jmp .w4_end 305.w4_mul: 306 punpckhqdq xmm1, xm0, xm0 307 lea r2d, [hq*2] 308 mov r6d, 0x55563334 309 paddd xmm1, xm0 310 shrx r6d, r6d, r2d 311 psrlq xmm0, xmm1, 32 312 paddd xmm0, xmm1 313 movd xmm1, r6d 314 psrld xmm0, 2 315 pmulhuw xmm0, xmm1 316.w4_end: 317 vpbroadcastb xm0, xmm0 318.s4: 319 movd [dstq+strideq*0], xm0 320 movd [dstq+strideq*1], xm0 321 movd [dstq+strideq*2], xm0 322 movd [dstq+stride3q ], xm0 323 lea dstq, [dstq+strideq*4] 324 sub hd, 4 325 jg .s4 326 RET 327.h8: 328 movq xmm1, [tlq-8] 329 vpdpbusd xm0, xmm1, xm3 330 jmp wq 331.w8: 332 movq xmm1, [tlq+1] 333 vextracti32x4 xm2, ym0, 1 334 vpdpbusd xm0, xmm1, xm3 335 paddd xmm2, xm2, xm0 336 punpckhqdq xmm0, xmm2, xmm2 337 paddd xmm0, xmm2 338 psrlq xmm1, xmm0, 32 339 paddd xmm0, xmm1 340 vpsrlvd xmm0, xmm4 341 cmp hd, 8 342 je .w8_end 343 mov r6d, 0x5556 344 mov r2d, 0x3334 345 cmp hd, 32 346 cmove r6d, r2d 347 movd xmm1, r6d 348 pmulhuw xmm0, xmm1 349.w8_end: 350 vpbroadcastb xm0, xmm0 351.s8: 352 movq [dstq+strideq*0], xm0 353 movq [dstq+strideq*1], xm0 354 movq [dstq+strideq*2], xm0 355 movq [dstq+stride3q ], xm0 356 lea dstq, [dstq+strideq*4] 357 sub hd, 4 358 jg .s8 359 RET 360.h16: 361 mova xmm1, [tlq-16] 362 vpdpbusd xm0, xmm1, xm3 363 jmp wq 364.w16: 365 movu xmm1, [tlq+1] 366 vextracti32x4 xm2, ym0, 1 367 vpdpbusd xm0, xmm1, xm3 368 paddd xmm2, xm2, xm0 369 punpckhqdq xmm0, xmm2, xmm2 370 paddd xmm0, xmm2 371 psrlq xmm1, xmm0, 32 372 paddd xmm0, xmm1 373 vpsrlvd xmm0, xmm4 374 cmp hd, 16 375 je .w16_end 376 mov r6d, 0x5556 377 mov r2d, 0x3334 378 test hb, 8|32 379 cmovz r6d, r2d 380 movd xmm1, r6d 381 pmulhuw xmm0, xmm1 382.w16_end: 383 vpbroadcastb xm0, xmm0 384.s16: 385 mova [dstq+strideq*0], xm0 386 mova [dstq+strideq*1], xm0 387 mova [dstq+strideq*2], xm0 388 mova [dstq+stride3q ], xm0 389 lea dstq, [dstq+strideq*4] 390 sub hd, 4 391 jg .s16 392 RET 393.h32: 394 mova ym1, [tlq-32] 395 vpdpbusd ym0, ym1, ym3 396 jmp wq 397.w32: 398 movu ym1, [tlq+1] 399 
vpdpbusd ym0, ym1, ym3 400 vextracti32x4 xm1, ym0, 1 401 paddd xmm1, xm1, xm0 402 punpckhqdq xmm0, xmm1, xmm1 403 paddd xmm0, xmm1 404 psrlq xmm1, xmm0, 32 405 paddd xmm0, xmm1 406 vpsrlvd xmm0, xmm4 407 cmp hd, 32 408 je .w32_end 409 lea r2d, [hq*2] 410 mov r6d, 0x33345556 411 shrx r6d, r6d, r2d 412 movd xmm1, r6d 413 pmulhuw xmm0, xmm1 414.w32_end: 415 vpbroadcastb ym0, xmm0 416.s32: 417 mova [dstq+strideq*0], ym0 418 mova [dstq+strideq*1], ym0 419 mova [dstq+strideq*2], ym0 420 mova [dstq+stride3q ], ym0 421 lea dstq, [dstq+strideq*4] 422 sub hd, 4 423 jg .s32 424 RET 425.h64: 426 mova ym1, [tlq-64] 427 mova ym2, [tlq-32] 428 vpdpbusd ym0, ym1, ym3 429 vpdpbusd ym0, ym2, ym3 430 jmp wq 431.w64: 432 movu ym1, [tlq+ 1] 433 movu ym2, [tlq+33] 434 vpdpbusd ym0, ym1, ym3 435 vpdpbusd ym0, ym2, ym3 436 vextracti32x4 xm1, ym0, 1 437 paddd xmm1, xm1, xm0 438 punpckhqdq xmm0, xmm1, xmm1 439 paddd xmm0, xmm1 440 psrlq xmm1, xmm0, 32 441 paddd xmm0, xmm1 442 vpsrlvd xmm0, xmm4 443 cmp hd, 64 444 je .w64_end 445 mov r6d, 0x33345556 446 shrx r6d, r6d, hd 447 movd xmm1, r6d 448 pmulhuw xmm0, xmm1 449.w64_end: 450 vpbroadcastb m0, xmm0 451.s64: 452 mova [dstq+strideq*0], m0 453 mova [dstq+strideq*1], m0 454 mova [dstq+strideq*2], m0 455 mova [dstq+stride3q ], m0 456 lea dstq, [dstq+strideq*4] 457 sub hd, 4 458 jg .s64 459 RET 460 461cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 462 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] 463 tzcnt wd, wm 464 movifnidn hd, hm 465 movsxd wq, [r5+wq*4] 466 vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] 467 add wq, r5 468 lea stride3q, [strideq*3] 469 jmp wq 470 471cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 472 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] 473 tzcnt wd, wm 474 movu m0, [tlq+1] 475 movifnidn hd, hm 476 movsxd wq, [r5+wq*4] 477 add wq, r5 478 lea stride3q, [strideq*3] 479 jmp wq 480 481cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 482%define base 
r6-ipred_h_8bpc_avx512icl_table 483 lea r6, [ipred_h_8bpc_avx512icl_table] 484 tzcnt wd, wm 485 mov hd, hm 486 movsxd wq, [r6+wq*4] 487 lea stride3q, [strideq*3] 488 sub tlq, hq 489 add wq, r6 490 jmp wq 491.w4: 492 mova xmm1, [base+ipred_h_shuf+16] 493.w4_loop: 494 movd xmm0, [tlq+hq-4] 495 pshufb xmm0, xmm1 496 movd [dstq+strideq*0], xmm0 497 pextrd [dstq+strideq*1], xmm0, 1 498 pextrd [dstq+strideq*2], xmm0, 2 499 pextrd [dstq+stride3q ], xmm0, 3 500 lea dstq, [dstq+strideq*4] 501 sub hd, 4 502 jg .w4_loop 503 RET 504.w8: 505 movsldup xmm2, [base+ipred_h_shuf+16] 506 movshdup xmm3, [base+ipred_h_shuf+16] 507.w8_loop: 508 movd xmm1, [tlq+hq-4] 509 pshufb xmm0, xmm1, xmm2 510 pshufb xmm1, xmm3 511 movq [dstq+strideq*0], xmm0 512 movq [dstq+strideq*1], xmm1 513 movhps [dstq+strideq*2], xmm0 514 movhps [dstq+stride3q ], xmm1 515 lea dstq, [dstq+strideq*4] 516 sub hd, 4 517 jg .w8_loop 518 RET 519.w16: 520 movsldup m1, [base+smooth_shuf] 521.w16_loop: 522 vpbroadcastd m0, [tlq+hq-4] 523 pshufb m0, m1 524 mova [dstq+strideq*0], xm0 525 vextracti32x4 [dstq+strideq*1], m0, 2 526 vextracti32x4 [dstq+strideq*2], ym0, 1 527 vextracti32x4 [dstq+stride3q ], m0, 3 528 lea dstq, [dstq+strideq*4] 529 sub hd, 4 530 jg .w16 531 RET 532.w32: 533 vpbroadcastd ym3, [base+pb_1] 534 vpord m2, m3, [base+pb_2] {1to16} 535.w32_loop: 536 vpbroadcastd m1, [tlq+hq-4] 537 pshufb m0, m1, m2 538 pshufb m1, m3 539 mova [dstq+strideq*0], ym0 540 vextracti32x8 [dstq+strideq*1], m0, 1 541 mova [dstq+strideq*2], ym1 542 vextracti32x8 [dstq+stride3q ], m1, 1 543 lea dstq, [dstq+strideq*4] 544 sub hd, 4 545 jg .w32_loop 546 RET 547.w64: 548 vpbroadcastd m4, [base+pb_3] 549 vpbroadcastd m5, [base+pb_2] 550 vpbroadcastd m6, [base+pb_1] 551 pxor m7, m7 552.w64_loop: 553 vpbroadcastd m3, [tlq+hq-4] 554 pshufb m0, m3, m4 555 pshufb m1, m3, m5 556 pshufb m2, m3, m6 557 pshufb m3, m7 558 mova [dstq+strideq*0], m0 559 mova [dstq+strideq*1], m1 560 mova [dstq+strideq*2], m2 561 mova [dstq+stride3q ], m3 562 
lea dstq, [dstq+strideq*4] 563 sub hd, 4 564 jg .w64_loop 565 RET 566 567%macro PAETH 0 568 psubusb m1, m5, m4 569 psubusb m0, m4, m5 570 por m1, m0 ; tdiff 571 pavgb m2, m6, m4 572 vpcmpub k1, m1, m7, 1 ; tdiff < ldiff 573 vpblendmb m0{k1}, m4, m6 574 vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 575 psubusb m3, m5, m2 576 psubb m2, m4 577 psubusb m2, m5 578 por m2, m3 579 pminub m1, m7 580 paddusb m2, m2 581 por m2, m4 ; min(tldiff, 255) 582 vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff 583 vmovdqu8 m0{k1}, m5 584%endmacro 585 586cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 587 lea r6, [ipred_paeth_8bpc_avx512icl_table] 588 tzcnt wd, wm 589 vpbroadcastb m5, [tlq] ; topleft 590 mov hd, hm 591 movsxd wq, [r6+wq*4] 592 vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] 593 lea topq, [tlq+1] 594 sub tlq, hq 595 add wq, r6 596 lea stride3q, [strideq*3] 597 jmp wq 598INIT_YMM avx512icl 599.w4: 600 vpbroadcastd m6, [topq] 601 mova m9, [ipred_h_shuf] 602 psubusb m7, m5, m6 603 psubusb m0, m6, m5 604 por m7, m0 ; ldiff 605.w4_loop: 606 vpbroadcastq m4, [tlq+hq-8] 607 pshufb m4, m9 ; left 608 PAETH 609 movd [dstq+strideq*0], xm0 610 pextrd [dstq+strideq*1], xm0, 1 611 pextrd [dstq+strideq*2], xm0, 2 612 pextrd [dstq+stride3q ], xm0, 3 613 sub hd, 8 614 jl .w4_ret 615 vextracti32x4 xm0, m0, 1 616 lea dstq, [dstq+strideq*4] 617 movd [dstq+strideq*0], xm0 618 pextrd [dstq+strideq*1], xm0, 1 619 pextrd [dstq+strideq*2], xm0, 2 620 pextrd [dstq+stride3q ], xm0, 3 621 lea dstq, [dstq+strideq*4] 622 jg .w4_loop 623.w4_ret: 624 RET 625INIT_ZMM avx512icl 626.w8: 627 vpbroadcastq m6, [topq] 628 movsldup m9, [smooth_shuf] 629 psubusb m7, m5, m6 630 psubusb m0, m6, m5 631 por m7, m0 632.w8_loop: 633 vpbroadcastq m4, [tlq+hq-8] 634 pshufb m4, m9 635 PAETH 636 vextracti32x4 xm1, m0, 2 637 vextracti32x4 xm2, ym0, 1 638 vextracti32x4 xm3, m0, 3 639 movq [dstq+strideq*0], xm0 640 movq [dstq+strideq*1], xm1 641 movq [dstq+strideq*2], xm2 642 
movq [dstq+stride3q ], xm3 643 sub hd, 8 644 jl .w8_ret 645 lea dstq, [dstq+strideq*4] 646 movhps [dstq+strideq*0], xm0 647 movhps [dstq+strideq*1], xm1 648 movhps [dstq+strideq*2], xm2 649 movhps [dstq+stride3q ], xm3 650 lea dstq, [dstq+strideq*4] 651 jg .w8_loop 652.w8_ret: 653 RET 654.w16: 655 vbroadcasti32x4 m6, [topq] 656 movsldup m9, [smooth_shuf] 657 psubusb m7, m5, m6 658 psubusb m0, m6, m5 659 por m7, m0 660.w16_loop: 661 vpbroadcastd m4, [tlq+hq-4] 662 pshufb m4, m9 663 PAETH 664 mova [dstq+strideq*0], xm0 665 vextracti32x4 [dstq+strideq*1], m0, 2 666 vextracti32x4 [dstq+strideq*2], ym0, 1 667 vextracti32x4 [dstq+stride3q ], m0, 3 668 lea dstq, [dstq+strideq*4] 669 sub hd, 4 670 jg .w16_loop 671 RET 672.w32: 673 vbroadcasti32x8 m6, [topq] 674 mova ym9, ym8 675 psubusb m7, m5, m6 676 psubusb m0, m6, m5 677 por m7, m0 678.w32_loop: 679 vpbroadcastd m4, [tlq+hq-2] 680 pshufb m4, m9 681 PAETH 682 mova [dstq+strideq*0], ym0 683 vextracti32x8 [dstq+strideq*1], m0, 1 684 lea dstq, [dstq+strideq*2] 685 sub hd, 2 686 jg .w32_loop 687 RET 688.w64: 689 movu m6, [topq] 690 psubusb m7, m5, m6 691 psubusb m0, m6, m5 692 por m7, m0 693.w64_loop: 694 vpbroadcastb m4, [tlq+hq-1] 695 PAETH 696 mova [dstq], m0 697 add dstq, strideq 698 dec hd 699 jg .w64_loop 700 RET 701 702cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 703%define base r6-ipred_smooth_v_8bpc_avx512icl_table 704 lea r6, [ipred_smooth_v_8bpc_avx512icl_table] 705 tzcnt wd, wm 706 mov hd, hm 707 movsxd wq, [r6+wq*4] 708 vpbroadcastd m0, [base+pb_127_m127] 709 vpbroadcastd m1, [base+pw_128] 710 lea weightsq, [base+smooth_weights+hq*4] 711 neg hq 712 vpbroadcastb m4, [tlq+hq] ; bottom 713 add wq, r6 714 lea stride3q, [strideq*3] 715 jmp wq 716.w4: 717 vpbroadcastd m2, [tlq+1] 718 movshdup m5, [smooth_shuf] 719 mova ym6, [smooth_endA] 720 punpcklbw m2, m4 ; top, bottom 721 pmaddubsw m3, m2, m0 722 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok 723 paddw m3, m1 ; 128 * 
top + 129 * bottom + 128 724.w4_loop: 725 vbroadcasti32x4 m0, [weightsq+hq*2] 726 pshufb m0, m5 727 pmaddubsw m0, m2, m0 728 paddw m0, m3 729 vpermb m0, m6, m0 730 vextracti32x4 xm1, ym0, 1 731 movd [dstq+strideq*0], xm0 732 movd [dstq+strideq*1], xm1 733 pextrd [dstq+strideq*2], xm0, 2 734 pextrd [dstq+stride3q ], xm1, 2 735 add hq, 8 736 jg .ret 737 lea dstq, [dstq+strideq*4] 738 pextrd [dstq+strideq*0], xm0, 1 739 pextrd [dstq+strideq*1], xm1, 1 740 pextrd [dstq+strideq*2], xm0, 3 741 pextrd [dstq+stride3q ], xm1, 3 742 lea dstq, [dstq+strideq*4] 743 jl .w4_loop 744.ret: 745 RET 746.w8: 747 vpbroadcastq m2, [tlq+1] 748 movshdup m5, [smooth_shuf] 749 mova ym6, [smooth_endA] 750 punpcklbw m2, m4 751 pmaddubsw m3, m2, m0 752 paddw m1, m2 753 paddw m3, m1 754.w8_loop: 755 vpbroadcastq m0, [weightsq+hq*2] 756 pshufb m0, m5 757 pmaddubsw m0, m2, m0 758 paddw m0, m3 759 vpermb m0, m6, m0 760 vextracti32x4 xm1, ym0, 1 761 movq [dstq+strideq*0], xm0 762 movq [dstq+strideq*1], xm1 763 movhps [dstq+strideq*2], xm0 764 movhps [dstq+stride3q ], xm1 765 lea dstq, [dstq+strideq*4] 766 add hq, 4 767 jl .w8_loop 768 RET 769.w16: 770 vbroadcasti32x4 m3, [tlq+1] 771 movshdup m6, [smooth_shuf] 772 mova m7, [smooth_endB] 773 punpcklbw m2, m3, m4 774 punpckhbw m3, m4 775 pmaddubsw m4, m2, m0 776 pmaddubsw m5, m3, m0 777 paddw m0, m1, m2 778 paddw m1, m3 779 paddw m4, m0 780 paddw m5, m1 781.w16_loop: 782 vpbroadcastq m1, [weightsq+hq*2] 783 pshufb m1, m6 784 pmaddubsw m0, m2, m1 785 pmaddubsw m1, m3, m1 786 paddw m0, m4 787 paddw m1, m5 788 vpermt2b m0, m7, m1 789 mova [dstq+strideq*0], xm0 790 vextracti32x4 [dstq+strideq*1], m0, 2 791 vextracti32x4 [dstq+strideq*2], ym0, 1 792 vextracti32x4 [dstq+stride3q ], m0, 3 793 lea dstq, [dstq+strideq*4] 794 add hq, 4 795 jl .w16_loop 796 RET 797.w32: 798 vbroadcasti32x8 m3, [tlq+1] 799 movshdup m6, [smooth_shuf] 800 mova m7, [smooth_endB] 801 punpcklbw m2, m3, m4 802 punpckhbw m3, m4 803 pmaddubsw m4, m2, m0 804 pmaddubsw m5, m3, m0 805 
paddw m0, m1, m2 806 paddw m1, m3 807 paddw m4, m0 808 paddw m5, m1 809.w32_loop: 810 vpbroadcastd m1, [weightsq+hq*2] 811 pshufb m1, m6 812 pmaddubsw m0, m2, m1 813 pmaddubsw m1, m3, m1 814 paddw m0, m4 815 paddw m1, m5 816 vpermt2b m0, m7, m1 817 mova [dstq+strideq*0], ym0 818 vextracti32x8 [dstq+strideq*1], m0, 1 819 lea dstq, [dstq+strideq*2] 820 add hq, 2 821 jl .w32_loop 822 RET 823.w64: 824 movu m3, [tlq+1] 825 mova m6, [smooth_endB] 826 punpcklbw m2, m3, m4 827 punpckhbw m3, m4 828 pmaddubsw m4, m2, m0 829 pmaddubsw m5, m3, m0 830 paddw m0, m1, m2 831 paddw m1, m3 832 paddw m4, m0 833 paddw m5, m1 834.w64_loop: 835 vpbroadcastw m1, [weightsq+hq*2] 836 pmaddubsw m0, m2, m1 837 pmaddubsw m1, m3, m1 838 paddw m0, m4 839 paddw m1, m5 840 vpermt2b m0, m6, m1 841 mova [dstq], m0 842 add dstq, strideq 843 inc hq 844 jl .w64_loop 845 RET 846 847cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 848%define base r5-ipred_smooth_h_8bpc_avx512icl_table 849 lea r5, [ipred_smooth_h_8bpc_avx512icl_table] 850 mov r6d, wd 851 tzcnt wd, wd 852 vpbroadcastb m4, [tlq+r6] ; right 853 mov hd, hm 854 movsxd wq, [r5+wq*4] 855 vpbroadcastd m5, [base+pb_127_m127] 856 vpbroadcastd m6, [base+pw_128] 857 sub tlq, hq 858 add wq, r5 859 vpmovb2m k1, m6 860 lea stride3q, [strideq*3] 861 jmp wq 862.w4: 863 movsldup m3, [smooth_shuf] 864 vpbroadcastq m7, [smooth_weights+4*2] 865 mova ym8, [smooth_endA] 866.w4_loop: 867 vpbroadcastq m0, [tlq+hq-8] 868 mova m2, m4 869 vpshufb m2{k1}, m0, m3 ; left, right 870 pmaddubsw m0, m2, m5 871 pmaddubsw m1, m2, m7 872 paddw m2, m6 873 paddw m0, m2 874 paddw m0, m1 875 vpermb m0, m8, m0 876 vextracti32x4 xm1, ym0, 1 877 movd [dstq+strideq*0], xm0 878 movd [dstq+strideq*1], xm1 879 pextrd [dstq+strideq*2], xm0, 2 880 pextrd [dstq+stride3q ], xm1, 2 881 sub hd, 8 882 jl .ret 883 lea dstq, [dstq+strideq*4] 884 pextrd [dstq+strideq*0], xm0, 1 885 pextrd [dstq+strideq*1], xm1, 1 886 pextrd [dstq+strideq*2], xm0, 3 887 pextrd [dstq+stride3q 
], xm1, 3 888 lea dstq, [dstq+strideq*4] 889 jg .w4_loop 890.ret: 891 RET 892.w8: 893 movsldup m3, [smooth_shuf] 894 vbroadcasti32x4 m7, [smooth_weights+8*2] 895 mova ym8, [smooth_endA] 896.w8_loop: 897 vpbroadcastd m0, [tlq+hq-4] 898 mova m2, m4 899 vpshufb m2{k1}, m0, m3 900 pmaddubsw m0, m2, m5 901 pmaddubsw m1, m2, m7 902 paddw m2, m6 903 paddw m0, m2 904 paddw m0, m1 905 vpermb m0, m8, m0 906 vextracti32x4 xm1, ym0, 1 907 movq [dstq+strideq*0], xm0 908 movq [dstq+strideq*1], xm1 909 movhps [dstq+strideq*2], xm0 910 movhps [dstq+stride3q ], xm1 911 lea dstq, [dstq+strideq*4] 912 sub hd, 4 913 jg .w8_loop 914 RET 915.w16: 916 movsldup m7, [smooth_shuf] 917 vbroadcasti32x4 m8, [smooth_weights+16*2] 918 vbroadcasti32x4 m9, [smooth_weights+16*3] 919 mova m10, [smooth_endB] 920.w16_loop: 921 vpbroadcastd m0, [tlq+hq-4] 922 mova m3, m4 923 vpshufb m3{k1}, m0, m7 924 pmaddubsw m2, m3, m5 925 pmaddubsw m0, m3, m8 926 pmaddubsw m1, m3, m9 927 paddw m3, m6 928 paddw m2, m3 929 paddw m0, m2 930 paddw m1, m2 931 vpermt2b m0, m10, m1 932 mova [dstq+strideq*0], xm0 933 vextracti32x4 [dstq+strideq*1], m0, 2 934 vextracti32x4 [dstq+strideq*2], ym0, 1 935 vextracti32x4 [dstq+stride3q ], m0, 3 936 lea dstq, [dstq+strideq*4] 937 sub hd, 4 938 jg .w16_loop 939 RET 940.w32: 941 mova m10, [smooth_endA] 942 vpbroadcastd ym7, [pb_1] 943 vbroadcasti32x8 m8, [smooth_weights+32*2] 944 vbroadcasti32x8 m9, [smooth_weights+32*3] 945 vshufi32x4 m10, m10, q3120 946.w32_loop: 947 vpbroadcastd m0, [tlq+hq-2] 948 mova m3, m4 949 vpshufb m3{k1}, m0, m7 950 pmaddubsw m2, m3, m5 951 pmaddubsw m0, m3, m8 952 pmaddubsw m1, m3, m9 953 paddw m3, m6 954 paddw m2, m3 955 paddw m0, m2 956 paddw m1, m2 957 vpermt2b m0, m10, m1 958 mova [dstq+strideq*0], ym0 959 vextracti32x8 [dstq+strideq*1], m0, 1 960 lea dstq, [dstq+strideq*2] 961 sub hd, 2 962 jg .w32_loop 963 RET 964.w64: 965 mova m7, [smooth_weights+64*2] 966 mova m8, [smooth_weights+64*3] 967 mova m9, [smooth_endA] 968.w64_loop: 969 mova m3, m4 970 
vpbroadcastb m3{k1}, [tlq+hq-1] 971 pmaddubsw m2, m3, m5 972 pmaddubsw m0, m3, m7 973 pmaddubsw m1, m3, m8 974 paddw m3, m6 975 paddw m2, m3 976 paddw m0, m2 977 paddw m1, m2 978 vpermt2b m0, m9, m1 979 mova [dstq], m0 980 add dstq, strideq 981 dec hd 982 jg .w64_loop 983 RET 984 985cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 986%define base r5-ipred_smooth_8bpc_avx512icl_table 987 lea r5, [ipred_smooth_8bpc_avx512icl_table] 988 mov r6d, wd 989 tzcnt wd, wd 990 mov hd, hm 991 vpbroadcastb m6, [tlq+r6] ; right 992 sub tlq, hq 993 movsxd wq, [r5+wq*4] 994 vpbroadcastd m7, [base+pb_127_m127] 995 vpbroadcastb m0, [tlq] ; bottom 996 vpbroadcastd m1, [base+pw_255] 997 add wq, r5 998 lea v_weightsq, [base+smooth_weights+hq*2] 999 vpmovb2m k1, m1 1000 lea stride3q, [strideq*3] 1001 jmp wq 1002.w4: 1003 vpbroadcastd m8, [tlq+hq+1] 1004 movsldup m4, [smooth_shuf] 1005 movshdup m5, [smooth_shuf] 1006 vpbroadcastq m9, [smooth_weights+4*2] 1007 mova ym11, [smooth_endA] 1008 1009 punpcklbw m8, m0 ; top, bottom 1010 pmaddubsw m10, m8, m7 1011 paddw m1, m8 ; 1 * top + 256 * bottom + 255 1012 paddw m10, m1 ; 128 * top + 129 * bottom + 255 1013.w4_loop: 1014 vpbroadcastq m1, [tlq+hq-8] 1015 vbroadcasti32x4 m0, [v_weightsq] 1016 add v_weightsq, 16 1017 mova m2, m6 1018 vpshufb m2{k1}, m1, m4 ; left, right 1019 pmaddubsw m1, m2, m7 ; 127 * left - 127 * right 1020 pshufb m0, m5 1021 pmaddubsw m0, m8, m0 1022 paddw m1, m2 ; 128 * left + 129 * right 1023 pmaddubsw m2, m9 1024 paddw m0, m10 1025 paddw m1, m2 1026 pavgw m0, m1 1027 vpermb m0, m11, m0 1028 vextracti32x4 xm1, ym0, 1 1029 movd [dstq+strideq*0], xm0 1030 movd [dstq+strideq*1], xm1 1031 pextrd [dstq+strideq*2], xm0, 2 1032 pextrd [dstq+stride3q ], xm1, 2 1033 sub hd, 8 1034 jl .ret 1035 lea dstq, [dstq+strideq*4] 1036 pextrd [dstq+strideq*0], xm0, 1 1037 pextrd [dstq+strideq*1], xm1, 1 1038 pextrd [dstq+strideq*2], xm0, 3 1039 pextrd [dstq+stride3q ], xm1, 3 1040 lea dstq, [dstq+strideq*4] 
1041 jg .w4_loop 1042.ret: 1043 RET 1044.w8: 1045 vpbroadcastq m8, [tlq+hq+1] 1046 movsldup m4, [smooth_shuf] 1047 movshdup m5, [smooth_shuf] 1048 vbroadcasti32x4 m9, [smooth_weights+8*2] 1049 mova ym11, [smooth_endA] 1050 punpcklbw m8, m0 1051 pmaddubsw m10, m8, m7 1052 paddw m1, m8 1053 paddw m10, m1 1054.w8_loop: 1055 vpbroadcastd m1, [tlq+hq-4] 1056 vpbroadcastq m0, [v_weightsq] 1057 add v_weightsq, 8 1058 mova m2, m6 1059 vpshufb m2{k1}, m1, m4 1060 pmaddubsw m1, m2, m7 1061 pshufb m0, m5 1062 pmaddubsw m0, m8, m0 1063 paddw m1, m2 1064 pmaddubsw m2, m9 1065 paddw m0, m10 1066 paddw m1, m2 1067 pavgw m0, m1 1068 vpermb m0, m11, m0 1069 vextracti32x4 xm1, ym0, 1 1070 movq [dstq+strideq*0], xm0 1071 movq [dstq+strideq*1], xm1 1072 movhps [dstq+strideq*2], xm0 1073 movhps [dstq+stride3q ], xm1 1074 lea dstq, [dstq+strideq*4] 1075 sub hd, 4 1076 jg .w8_loop 1077 RET 1078.w16: 1079 vbroadcasti32x4 m9, [tlq+hq+1] 1080 movsldup m5, [smooth_shuf] 1081 movshdup m10, [smooth_shuf] 1082 vbroadcasti32x4 m11, [smooth_weights+16*2] 1083 vbroadcasti32x4 m12, [smooth_weights+16*3] 1084 mova m15, [smooth_endB] 1085 punpcklbw m8, m9, m0 1086 punpckhbw m9, m0 1087 pmaddubsw m13, m8, m7 1088 pmaddubsw m14, m9, m7 1089 paddw m0, m1, m8 1090 paddw m1, m9 1091 paddw m13, m0 1092 paddw m14, m1 1093.w16_loop: 1094 vpbroadcastd m0, [tlq+hq-4] 1095 vpbroadcastq m1, [v_weightsq] 1096 add v_weightsq, 8 1097 mova m4, m6 1098 vpshufb m4{k1}, m0, m5 1099 pmaddubsw m2, m4, m7 1100 pshufb m1, m10 1101 pmaddubsw m0, m8, m1 1102 pmaddubsw m1, m9, m1 1103 paddw m2, m4 1104 pmaddubsw m3, m4, m11 1105 pmaddubsw m4, m12 1106 paddw m0, m13 1107 paddw m1, m14 1108 paddw m3, m2 1109 paddw m4, m2 1110 pavgw m0, m3 1111 pavgw m1, m4 1112 vpermt2b m0, m15, m1 1113 mova [dstq+strideq*0], xm0 1114 vextracti32x4 [dstq+strideq*1], m0, 2 1115 vextracti32x4 [dstq+strideq*2], ym0, 1 1116 vextracti32x4 [dstq+stride3q ], m0, 3 1117 lea dstq, [dstq+strideq*4] 1118 sub hd, 4 1119 jg .w16_loop 1120 RET 1121.w32: 1122 
; .w32/.w64 tails of a smooth intra-prediction function whose entry point
; precedes this chunk (it reads smooth_weights[] and per-row v_weights).
; NOTE(review): register setup (m0/m1/m6/m7, mask k1) is established before
; this point — see the function prologue earlier in the file.
    vbroadcasti32x8 m9, [tlq+hq+1]
    movshdup     m10, [smooth_shuf]
    mova         m12, [smooth_weights+32*2]
    vpbroadcastd ym5, [pb_1]
    mova         m15, [smooth_endB]
    punpcklbw    m8, m9, m0
    punpckhbw    m9, m0
    pmaddubsw    m13, m8, m7
    pmaddubsw    m14, m9, m7
    vshufi32x4   m11, m12, m12, q2020
    vshufi32x4   m12, m12, q3131
    paddw        m0, m1, m8
    paddw        m1, m9
    paddw        m13, m0
    paddw        m14, m1
.w32_loop:
    vpbroadcastd m0, [tlq+hq-2]     ; two left-edge pixels for this row pair
    vpbroadcastd m1, [v_weightsq]
    add          v_weightsq, 4
    mova         m4, m6
    vpshufb      m4{k1}, m0, m5
    pmaddubsw    m2, m4, m7
    pshufb       m1, m10
    pmaddubsw    m0, m8, m1
    pmaddubsw    m1, m9, m1
    paddw        m2, m4
    pmaddubsw    m3, m4, m11
    pmaddubsw    m4, m12
    paddw        m0, m13
    paddw        m1, m14
    paddw        m3, m2
    paddw        m4, m2
    pavgw        m0, m3             ; average horizontal & vertical passes
    pavgw        m1, m4
    vpermt2b     m0, m15, m1        ; final permute/pack to bytes
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea          dstq, [dstq+strideq*2]
    sub          hd, 2
    jg .w32_loop
    RET
.w64:
    movu         m9, [tlq+hq+1]
    mova         m11, [smooth_weights+64*2]
    mova         m2, [smooth_weights+64*3]
    mova         m14, [smooth_endB]
    punpcklbw    m8, m9, m0
    punpckhbw    m9, m0
    pmaddubsw    m12, m8, m7
    pmaddubsw    m13, m9, m7
    vshufi32x4   m10, m11, m2, q2020
    vshufi32x4   m11, m2, q3131
    paddw        m0, m1, m8
    paddw        m1, m9
    paddw        m12, m0
    paddw        m13, m1
.w64_loop:
    mova         m4, m6
    vpbroadcastb m4{k1}, [tlq+hq-1] ; left-edge pixel for this row
    vpbroadcastw m1, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw    m2, m4, m7
    pmaddubsw    m0, m8, m1
    pmaddubsw    m1, m9, m1
    paddw        m2, m4
    pmaddubsw    m3, m4, m10
    pmaddubsw    m4, m11
    paddw        m0, m12
    paddw        m1, m13
    paddw        m3, m2
    paddw        m4, m2
    pavgw        m0, m3
    pavgw        m1, m4
    vpermt2b     m0, m14, m1
    mova         [dstq], m0
    add          dstq, strideq
    dec          hd
    jg .w64_loop
    RET

; Palette prediction: expand packed 4-bit palette indices into pixels by
; looking them up in the 8-entry palette (via pshufb / vpmultishiftqb).
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
    movifnidn    wd, wm
    movifnidn    hd, hm
    lea          stride3q, [strideq*3]
    cmp          wd, 8
    jg .w32                          ; w16/w32/w64 share the .w32 setup below
    movq         xmm3, [palq]        ; 8-byte palette used as pshufb LUT
    je .w8
.w4:
    movq         xmm0, [idxq]        ; 8 bytes = 16 packed 4-bit indices
    add          idxq, 8
    psrlw        xmm1, xmm0, 4
    punpcklbw    xmm0, xmm1          ; interleave low/high nibbles (hi bits masked by pshufb)
    pshufb       xmm0, xmm3, xmm0    ; palette lookup
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w4
    RET
.w8:
    movu         xmm2, [idxq]
    add          idxq, 16
    pshufb       xmm1, xmm3, xmm2    ; low nibbles
    psrlw        xmm2, 4
    pshufb       xmm2, xmm3, xmm2    ; high nibbles
    punpcklbw    xmm0, xmm1, xmm2
    punpckhbw    xmm1, xmm2
    movq   [dstq+strideq*0], xmm0
    movhps [dstq+strideq*1], xmm0
    movq   [dstq+strideq*2], xmm1
    movhps [dstq+stride3q ], xmm1
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w8
    RET
.w16:
    pmovzxdq     m0, [idxq]
    add          idxq, 32
    vpmultishiftqb m0, m3, m0        ; extract 4-bit fields into bytes
    pshufb       m0, m5, m0
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w16
    RET
.w32: ; shared setup for w >= 16
    vpbroadcastq m3, [pal_unpack+0]
    vpbroadcastq m5, [palq]
    cmp          wd, 32
    jl .w16
    pmovzxbd     m2, [pal_perm]
    vpbroadcastq m4, [pal_unpack+8]
    jg .w64
.w32_loop:
    vpermd       m1, m2, [idxq]
    add          idxq, 64
    vpmultishiftqb m0, m3, m1
    vpmultishiftqb m1, m4, m1
    pshufb       m0, m5, m0
    pshufb       m1, m5, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w32_loop
    RET
.w64:
    vpermd       m1, m2, [idxq]
    add          idxq, 64
    vpmultishiftqb m0, m3, m1
    vpmultishiftqb m1, m4, m1
    pshufb       m0, m5, m0
    pshufb       m1, m5, m1
    mova         [dstq+strideq*0], m0
    mova         [dstq+strideq*1], m1
    lea          dstq, [dstq+strideq*2]
    sub          hd, 2
    jg .w64
    RET

%if WIN64
    DECLARE_REG_TMP 4
%else
    DECLARE_REG_TMP 8
%endif

; Directional intra prediction, zone 1 (angles < 90): pixels are predicted
; from the top edge only, stepping by dx (from dr_intra_derivative) in
; 1/64th-pixel units.
cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea          r7, [z_filter_t0]
    tzcnt        wd, wm
    movifnidn    angled, anglem
    lea          t0, [dr_intra_derivative]
    movsxd       wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
    inc          tlq
    mov          dxd, angled
    and          dxd, 0x7e
    add          angled, 165 ; ~90
    movzx        dxd, word [t0+dxq]
    lea          wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
    movifnidn    hd, hm
    xor          angled, 0x4ff ; d = 90 - angle
    mova         m14, [base+z_frac_table]
    vpbroadcastd m15, [base+pw_512]
    jmp          wq
.w4:
    mova         m9, [pb_0to63]
    pminud       m8, m9, [base+pb_7] {1to16}
    vpbroadcastq m7, [tlq]
    pshufb       m7, m8
    cmp          angleb, 40
    jae .w4_no_upsample
    lea          r3d, [angleq-1024]
    sar          r3d, 7
    add          r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    ; 2x upsample of the top edge (doubles dx resolution)
    pshufb       xmm0, xm7, [base+z_filter_s4]
    mova         xmm1, [tlq-1]
    pshufb       xmm1, [base+z_xpos_off2a]
    vpbroadcastd xmm2, [base+pb_m4_36]
    vpbroadcastq m4, [pb_0to63]
    pmaddubsw    xmm0, xmm2
    pmaddubsw    xmm1, xmm2
    add          dxd, dxd
    kxnorw       k1, k1, k1
    paddw        xmm0, xmm1
    pmulhrsw     xm0, xmm0, xm15
    packuswb     xm0, xm0
    punpcklbw    ym7{k1}, ym0
    jmp .w4_main2
.w4_no_upsample:
    test         angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea          r3d, [hq+3]
    vpbroadcastb xm0, r3d
    vpbroadcastb xm1, angled
    shr          angled, 8 ; is_sm << 1
    vpcmpeqb     k1, xm0, [base+z_filter_wh]
    vpcmpgtb     k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw        r5d, k1
    test         r5d, r5d
    jz .w4_main
    ; edge smoothing filter, strength selected by popcnt of the mask
    vbroadcasti32x4 ym0, [tlq-1]
    pshufb       ym0, [base+z_filter4_s1]
    popcnt       r5d, r5d ; filter_strength
    pshufb       ym1, ym7, [z_filter_s4]
    pshufb       ym7, [base+z_filter_s3]
    vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
    vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
    pmaddubsw    ym0, ym11
    pmaddubsw    ym1, ym11
    pmaddubsw    ym7, ym12
    paddw        ym0, ym1
    paddw        ym7, ym0
    pmulhrsw     ym7, ym15
    cmp          hd, 4
    je .w4_filter_end
    vpbroadcastd m8, [base+pb_9]
    pminub       m8, m9
.w4_filter_end:
    paddb        m8, m8
    vpermb       m7, m8, m7
.w4_main:
    vpbroadcastq m4, [base+z_xpos_off1a]
.w4_main2:
    movsldup     m2, [base+z_xpos_mul]
    vpbroadcastw m5, dxd
    vbroadcasti32x4 m3, [base+z_xpos_bc]
    lea          r2, [strideq*3]
    pmullw       m2, m5 ; xpos
    psllw        m5, 5  ; dx*8
.w4_loop:
    psrlw        m1, m2, 3
    pshufb       m0, m2, m3
    vpermw       m1, m1, m14 ; 64-frac, frac
    paddsb       m0, m4      ; base, base+1
    vpermb       m0, m0, m7  ; top[base], top[base+1]
    paddsw       m2, m5      ; xpos += dx
    pmaddubsw    m0, m1      ; v
    pmulhrsw     m0, m15
    packuswb     m0, m0
    vextracti32x4 xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+r2       ], xm1, 1
    sub          hd, 8
    jl .w4_end
    vextracti32x4 xm1, m0, 2 ; top[max_base_x]
    lea          dstq, [dstq+strideq*4]
    vextracti32x4 xm0, m0, 3
    movd   [dstq+strideq*0], xm1
    pextrd [dstq+strideq*1], xm1, 1
    movd   [dstq+strideq*2], xm0
    pextrd [dstq+r2       ], xm0, 1
    lea          dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.w8_filter: ; 5-tap edge filter for the w8 path
    mova         ym0, [base+z_filter_s1]
    popcnt       r5d, r5d
    vbroadcasti32x4 ym1, [base+z_filter_s2]
    vbroadcasti32x4 ym3, [base+z_filter_s3]
    vbroadcasti32x4 ym4, [base+z_filter_s4]
    vpermi2b     ym0, ym7, ym2 ; al bl
    mova         ym5, [base+z_filter_s5]
    pshufb       ym1, ym7, ym1 ; ah bh
    vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
    pshufb       ym3, ym7, ym3 ; cl ch
    vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
    pshufb       ym4, ym7, ym4 ; el dl
    vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2]
    vpermb       ym5, ym5, ym7 ; eh dh
    pmaddubsw    ym0, ym11
    pmaddubsw    ym1, ym11
    pmaddubsw    ym2, ym3, ym12
    pmaddubsw    ym3, ym13
    pmaddubsw    ym4, ym11
    pmaddubsw    ym5, ym11
    paddw        ym0, ym2
    paddw        ym1, ym3
    paddw        ym0, ym4
    paddw        ym1, ym5
    pmulhrsw     ym0, ym15
    pmulhrsw     ym1, ym15
    packuswb     ym0, ym1
    ret
.w8:
    lea          r3d, [angleq+216]
    mov          r3b, hb
    cmp          r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    ; 2x upsample of the top edge
    lea          r3d, [hq-1]
    mova         xm1, [base+z_filter_s4]
    vpbroadcastb xm2, r3d
    mova         xm7, [tlq-1]
    vinserti32x4 ym7, [tlq+7], 1
    vbroadcasti32x4 ym0, [base+z_xpos_off1a]
    vpbroadcastd ym3, [base+pb_m4_36]
    pminub       xm2, xm1
    pshufb       ym0, ym7, ym0
    vinserti32x4 ym1, xm2, 1
    psrldq       ym7, 1
    pshufb       ym1, ym7, ym1
    pmaddubsw    ym0, ym3
    pmaddubsw    ym1, ym3
    vbroadcasti32x4 m8, [pb_0to63]
    add          dxd, dxd
    paddw        ym0, ym1
    pmulhrsw     ym0, ym15
    packuswb     ym0, ym0
    punpcklbw    ym7, ym0
    jmp .w8_main2
.w8_no_upsample:
    lea          r3d, [hq+7]
    mova         m9, [pb_0to63]
    vpbroadcastb ym0, r3d
    and          r3d, 7
    vbroadcasti32x4 m7, [tlq]
    or           r3d, 8 ; imin(h+7, 15)
    vpbroadcastb m8, r3d
    pminub       m8, m9
    pshufb       m7, m8
    test         angled, 0x400
    jnz .w8_main
    vpbroadcastb ym1, angled
    shr          angled, 8
    vpcmpeqb     k1, ym0, [base+z_filter_wh]
    mova         xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb     k1{k1}, ym1, ym0
    kmovd        r5d, k1
    test         r5d, r5d
    jz .w8_main
    vpbroadcastd ym2, [tlq-4]
    call .w8_filter
    cmp          hd, 8
    jle .w8_filter_end
    vpbroadcastd m8, [base+pb_17]
    add          r3d, 2
    pminub       m8, m9
.w8_filter_end:
    vpermb       m7, m8, m0
.w8_main:
    vbroadcasti32x4 m8, [base+z_xpos_off1a]
.w8_main2:
    movsldup     m4, [base+z_xpos_mul]
    vpbroadcastw m9, dxd
    shl          r3d, 6
    vpbroadcastd m5, [base+z_xpos_bc+8*0]
    pmullw       m4, m9 ; xpos
    vpbroadcastd m6, [base+z_xpos_bc+8*1]
    sub          r3d, dxd       ; r3d = threshold where only edge pixels remain
    shl          dxd, 3
    psllw        m9, 5 ; dx*8
    lea          r2, [strideq*3]
.w8_loop:
    psrlw        m3, m4, 3
    pshufb       m0, m4, m5
    pshufb       m1, m4, m6
    vpermw       m3, m3, m14
    paddsb       m0, m8
    paddsb       m1, m8
    vpermb       m0, m0, m7
    vpermb       m1, m1, m7
    paddsw       m4, m9
    punpcklqdq   m2, m3, m3
    pmaddubsw    m0, m2
    punpckhqdq   m3, m3
    pmaddubsw    m1, m3
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m0, m1
    vextracti32x4 xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    sub          hd, 8
    jl .w8_end
    vextracti32x8 ym0, m0, 1
    lea          dstq, [dstq+strideq*4]
    vextracti32x4 xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    jz .w8_end
    lea          dstq, [dstq+strideq*4]
    sub          r3d, dxd
    jg .w8_loop
    ; past max_base_x: remaining rows replicate the last edge pixel run
    vextracti32x4 xm7, m7, 3
.w8_end_loop:
    movq [dstq+strideq*0], xm7
    movq [dstq+strideq*1], xm7
    movq [dstq+strideq*2], xm7
    movq [dstq+r2       ], xm7
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w8_end_loop
.w8_end:
    RET
.w16_filter: ; 5-tap edge filter for the w16 path
    mova         m0, [base+z_filter_s1]
    popcnt       r5d, r5d
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpermi2b     m0, m7, m2 ; al bl
    mova         m5, [base+z_filter_s5]
    pshufb       m1, m7, m1 ; ah bh
    vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0]
    pshufb       m3, m7, m3 ; cl ch
    vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1]
    pshufb       m4, m7, m4 ; el dl
    vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2]
    vpermb       m5, m5, m7 ; eh dh
    pmaddubsw    m0, m11
    pmaddubsw    m1, m11
    pmaddubsw    m2, m3, m12
    pmaddubsw    m3, m13
    pmaddubsw    m4, m11
    pmaddubsw    m5, m11
    paddw        m0, m2
    paddw        m1, m3
    paddw        m0, m4
    paddw        m1, m5
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m0, m1
    ret
.w16:
    lea          r3d, [hq+15]
    mova         m9, [pb_0to63]
    vpbroadcastb ym0, r3d
    and          r3d, 15
    movu         ym7, [tlq]
    or           r3d, 16 ; imin(h+15, 31)
    vpbroadcastb m8, r3d
    pminub       m8, m9
    vpermb       m7, m8, m7
    test         angled, 0x400
    jnz .w16_main
    vpbroadcastb ym1, angled
    shr          angled, 8
    vpcmpeqb     k1, ym0, [base+z_filter_wh]
    mova         xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb     k1{k1}, ym1, ym0
    kmovd        r5d, k1
    test         r5d, r5d
    jz .w16_main
    vpbroadcastd m2, [tlq-4]
    call .w16_filter
    cmp          hd, 16
    jle .w16_filter_end
    vpbroadcastd m8, [base+pb_33]
    add          r3d, 2
    pminub       m8, m9
.w16_filter_end:
    vpermb       m7, m8, m0
.w16_main:
    movshdup     m3, [base+z_xpos_mul]
    vpbroadcastw m8, dxd
    shl          r3d, 6
    vpbroadcastd m4, [base+z_xpos_bc]
    pmullw       m3, m8 ; xpos
    vbroadcasti32x4 m5, [base+z_xpos_off1a]
    sub          r3d, dxd
    shl          dxd, 2
    vbroadcasti32x4 m6, [base+z_xpos_off1b]
    psllw        m8, 4 ; dx*4
    lea          r2, [strideq*3]
.w16_loop:
    pshufb       m1, m3, m4
    psrlw        m2, m3, 3
    paddsb       m0, m1, m5
    vpermw       m2, m2, m14
    paddsb       m1, m6
    vpermb       m0, m0, m7
    vpermb       m1, m1, m7
    paddsw       m3, m8
    pmaddubsw    m0, m2
    pmaddubsw    m1, m2
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    sub          hd, 4
    jz .w16_end
    lea          dstq, [dstq+strideq*4]
    sub          r3d, dxd
    jg .w16_loop
    vextracti32x4 xm7, m7, 3
.w16_end_loop:
    mova [dstq+strideq*0], xm7
    mova [dstq+strideq*1], xm7
    mova [dstq+strideq*2], xm7
    mova [dstq+r2       ], xm7
    lea          dstq, [dstq+strideq*4]
    sub          hd, 4
    jg .w16_end_loop
.w16_end:
    RET
.w32_filter: ; fixed-strength edge filter for the w32 path
    mova         m0, [base+z_filter_s1]
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpermi2b     m0, m7, m2 ; al bl
    mova         m5, [base+z_filter_s5]
    pshufb       m1, m7, m1 ; ah bh
    vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
    pshufb       m3, m7, m3 ; cl ch
    vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
    pshufb       m4, m7, m4 ; el dl
    vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
    vpermi2b     m5, m7, m8 ; eh dh
    pmaddubsw    m0, m11
    pmaddubsw    m1, m11
    pmaddubsw    m2, m3, m12
    pmaddubsw    m3, m13
    pmaddubsw    m4, m11
    pmaddubsw    m5, m11
    paddw        m0, m2
    paddw        m1, m3
    paddw        m0, m4
    paddw        m1, m5
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m7, m0, m1
    ret
.w32:
    lea          r3d, [hq+31]
    vpbroadcastb m9, r3d
    and          r3d, 31
    pminub       m10, m9, [pb_0to63]
    or           r3d, 32 ; imin(h+31, 63)
    vpermb       m7, m10, [tlq]
    vpbroadcastb m8, [tlq+r3]
    test         angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    vpbroadcastd m2, [tlq-4]
    call .w32_filter
    cmp          hd, 64
    je .w32_h64_filter_end
    vpermb       m8, m9, m7
    vpermb       m7, m10, m7
    jmp .w32_main
.w32_h64_filter_end: ; edge case for 32x64
    movd         xmm0, [tlq+r3-1]
    movd         xmm1, [base+pb_8_56_0_0]
    add          r3d, 2
    pmaddubsw    xmm0, xmm1
    vptestmw     k1, xmm1, xmm1 ; 0x01
    pmulhrsw     xm0, xmm0, xm15
    vmovdqu8     m8{k1}, m0
.w32_main:
    rorx         r2d, dxd, 30
    vpbroadcastd m4, [base+z_xpos_bc]
    vpbroadcastw m3, r2d
    vbroadcasti32x8 m5, [base+z_xpos_off2a]
    shl          r3d, 6
    vbroadcasti32x8 m6, [base+z_xpos_off2b]
    sub          r3d, dxd
    paddw        m9, m3, m3
    add          dxd, dxd
    vinserti32x8 m3, ym9, 1
.w32_loop:
    pshufb       m1, m3, m4
    psrlw        m2, m3, 3
    paddsb       m0, m1, m5
    vpermw       m2, m2, m14
    paddsb       m1, m6
    vpermi2b     m0, m7, m8
    vpermi2b     m1, m7, m8
    paddsw       m3, m9
    pmaddubsw    m0, m2
    pmaddubsw    m1, m2
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m0, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub          hd, 2
    jz .w32_end
    lea          dstq, [dstq+strideq*2]
    sub          r3d, dxd
    jg .w32_loop
    punpckhqdq   ym8, ym8
.w32_end_loop:
    mova [dstq+strideq*0], ym8
    mova [dstq+strideq*1], ym8
    lea          dstq, [dstq+strideq*2]
    sub          hd, 2
    jg .w32_end_loop
.w32_end:
    RET
.w64_filter: ; fixed-strength edge filter across two 64-byte edge vectors
    vbroadcasti32x4 m3, [base+z_filter_s2]
    mova         m1, [base+z_filter_s1]
    pshufb       m0, m3 ; al bl
    vpermi2b     m1, m7, m2
    vbroadcasti32x4 m4, [base+z_filter_s4]
    pshufb       m6, m8, m4 ; el dl
    pshufb       m9, m7, m4
    pminub       m10, m13, [base+z_filter_s5]
    pshufb       m2, m8, m3 ; ah bh
    pshufb       m3, m7, m3
    vbroadcasti32x4 m5, [base+z_filter_s3]
    vpermb       m10, m10, m8 ; eh dh
    pshufb       m11, m4
    vpbroadcastd m4, [base+z_filter_k+4*2+12*0]
    pshufb       m8, m5 ; cl ch
    pshufb       m7, m5
    vpbroadcastd m5, [base+z_filter_k+4*2+12*1]
    REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
    pmaddubsw    m4, m8, m5
    pmaddubsw    m5, m7, m5
    paddw        m0, m6
    vpbroadcastd m6, [base+z_filter_k+4*2+12*2]
    paddw        m1, m9
    pmaddubsw    m7, m6
    pmaddubsw    m8, m6
    paddw        m2, m10
    paddw        m3, m11
    paddw        m0, m4
    paddw        m1, m5
    paddw        m2, m8
    paddw        m3, m7
    REPX {pmulhrsw x, m15}, m0, m2, m1, m3
    packuswb     m0, m2
    packuswb     m7, m1, m3
    vpermb       m8, m12, m0
    ret
.w64:
    lea          r3d, [hq-1]
    movu         m7, [tlq+64*0]
    vpbroadcastb m13, r3d
    pminub       m12, m13, [pb_0to63]
    or           r3d, 64
    vpermb       m8, m12, [tlq+64*1]
    test         angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    movu         m0, [tlq+56]
    vpbroadcastd m2, [tlq-4]
    movu         m11, [tlq+8]
    call .w64_filter
.w64_main:
    rorx         r2d, dxd, 30
    vpbroadcastd m4, [base+z_xpos_bc]
    vpbroadcastw m3, r2d
    mova         m5, [base+z_xpos_off2a]
    shl          r3d, 6
    mova         m6, [base+z_xpos_off2b]
    sub          r3d, dxd
    mova         m9, m3
.w64_loop:
    pshufb       m1, m3, m4
    psrlw        m2, m3, 3
    paddsb       m0, m1, m5
    vpermw       m2, m2, m14
    paddsb       m1, m6
    vpermi2b     m0, m7, m8
    vpermi2b     m1, m7, m8
    paddsw       m3, m9
    pmaddubsw    m0, m2
    pmaddubsw    m1, m2
    pmulhrsw     m0, m15
    pmulhrsw     m1, m15
    packuswb     m0, m1
    mova         [dstq], m0
    dec          hd
    jz .w64_end
    add          dstq, strideq
    sub          r3d, dxd
    jg .w64_loop
    ; past max_base_x: replicate the last edge pixel for remaining rows
    vpermb       m8, m13, m8
.w64_end_loop:
    mova         [dstq], m8
    add          dstq, strideq
    dec          hd
    jg .w64_end_loop
.w64_end:
    RET

; Directional intra prediction, zone 2 (90 < angle < 180): pixels are
; predicted from both the top edge (stepping by dx) and the left edge
; (stepping by dy); negative base_x positions fall back to the left edge.
cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt        wd, wm
    movifnidn    angled, anglem
    lea          dxq, [dr_intra_derivative-90]
    movzx        dyd, angleb
    xor          angled, 0x400
    mov          r7, dxq
    sub          dxq, dyq
    movifnidn    hd, hm
    and          dyd, ~1
    and          dxq, ~1
    movzx        dyd, word [r7+dyq] ; angle - 90
    lea          r7, [z_filter_t0]
    movzx        dxd, word [dxq+270] ; 180 - angle
    movsxd       wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
    mova         m8, [base+pb_63to0]
    neg          dyd
    vpermb       m8, m8, [tlq-64] ; left (reversed)
    lea          wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
    mova         m14, [base+z_frac_table]
    inc          tlq
    vpbroadcastd m15, [base+pw_512]
    neg          dxd
    jmp          wq
.w4:
    movd         xm7, [tlq]
    vpbroadcastq m10, [base+z_xpos_off2a]
    test         angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea          r3d, [hq+2]
    add          angled, 1022
    shl          r3d, 6
    test         r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    vpbroadcastd xm2, [base+pb_4]
    sub          angled, 1075 ; angle - 53
    call .upsample_above
    lea          r3d, [hq+3]
    vpbroadcastq m10, [pb_0to63+1]
    punpcklbw    xm7, xm0, xm7
    call .filter_strength
    jmp .w4_filter_left
.w4_upsample_left:
    call .upsample_left
    movsldup     m16, [base+z_ypos_off3]
    vpbroadcastd m9, [base+pb_16]
    punpcklbw    xm8, xm0, xm8
    jmp .w4_main2
.w4_no_upsample_above:
    lea          r3d, [hq+3]
    sub          angled, 1112 ; angle - 90
    call .filter_strength
    test         r3d, r3d
    jz .w4_no_filter_above
    vpbroadcastd xm5, [base+pb_3]
    call .filter_top_w16
.w4_no_filter_above:
    lea          r3d, [hq+2]
    add          angled, 973 ; angle + 883
    shl          r3d, 6
    test         r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    vpbroadcastd ym0, [base+pb_90]
    psubb        ym0, ym17
    vpcmpgtb     k2{k2}, ym0, ym16
    kmovd        r3d, k2
.w4_filter_left:
    test         r3d, r3d
    jz .w4_main
    popcnt       r3d, r3d
    call .filter_left_h16
.w4_main:
    movsldup     m16, [base+z_ypos_off1]
    vpbroadcastd m9, [base+pb_8]
.w4_main2:
    vpbroadcastq m3, [base+z_ypos_mul1a]
    vpbroadcastw m0, dyd
    movsldup     m1, [base+z_xpos_mul]
    vpbroadcastw m5, dxd
    vinserti32x4 m7, [tlq-16], 3
    vinserti32x4 m8, [tlq-16], 3
    pmullw       m3, m0
    vbroadcasti32x4 m2, [base+z_xpos_bc]
    pmullw       m1, m5 ; xpos0..3
    psllw        m5, 5  ; dx*8
    psraw        m4, m3, 6
    psrlw        m3, 1
    packsswb     m4, m4
    vpermw       m3, m3, m14 ; 64-frac, frac
    punpcklbw    m4, m4
    lea          r2, [strideq*3]
    paddb        m4, m16 ; base, base+1
.w4_loop:
    pshufb       m16, m1, m2
    psrlw        m0, m1, 3
    paddb        m16, m10
    vpermw       m0, m0, m14
    vpmovw2m     k1, m16 ; base_x < 0
    vpermb       m16, m16, m7
    pmaddubsw    m16, m0
    vpermb       m0, m4, m8
    pmaddubsw    m16{k1}, m0, m3 ; use left edge where base_x < 0
    pmulhrsw     m16, m15
    vpmovwb      ym16, m16
    movd   [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2       ], xm16, 3
    sub          hd, 8
    jl .w4_end
    paddsw       m1, m5
    vextracti128 xm16, ym16, 1
    lea          dstq, [dstq+strideq*4]
    paddb        m4, m9
    movd   [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2       ], xm16, 3
    lea          dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.upsample_above: ; w4/w8
    mova         xm0, [tlq-1]
    xor          angled, 0x7f ; 180 - angle
    add          dxd, dxd
    jmp .upsample
.upsample_left: ; h4/h8
    palignr      xm0, xm8, [tlq-16], 15
    vpbroadcastb xm2, hd
    add          dyd, dyd
.upsample:
    pshufb       xm1, xm0, [base+z_filter4_s1]
    pminub       xm2, [base+z_filter_s4]
    vpbroadcastd xm3, [base+pb_m4_36]
    pshufb       xm0, xm2
    pmaddubsw    xm1, xm3
    pmaddubsw    xm0, xm3
    paddw        xm0, xm1
    pmulhrsw     xm0, xm15
    packuswb     xm0, xm0
    ret
.filter_strength: ; sets k2/r3d from (w, h, angle, is_sm)
    vpbroadcastb ym16, r3d
    mov          r3d, angled
    vpbroadcastd m2, [tlq-4]
    vpbroadcastb ym17, angled
    shr          r3d, 8
    vpcmpeqb     k2, ym16, [base+z_filter_wh]
    mova         xm16, [base+z_filter_t0+r3*8]
    vpcmpgtb     k1{k2}, ym17, ym16
    mova         m9, [pb_0to63]
    kmovd        r3d, k1
    ret
.w8:
    movq         xm7, [tlq]
    vbroadcasti32x4 m10, [base+z_xpos_off2a]
    test         angled, 0x400
    jnz .w8_main
    lea          r3d, [angleq+126]
    mov          r3b, hb
    cmp          r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    vpbroadcastd xm2, [base+pb_8]
    sub          angled, 53 ; angle - 53
    call .upsample_above
    lea          r3d, [hq+7]
    vbroadcasti32x4 m10, [pb_0to63+1]
    punpcklbw    xm7, xm0, xm7
    call .filter_strength
    jmp .w8_filter_left
.w8_upsample_left:
    call .upsample_left
    movshdup     m16, [base+z_ypos_off3]
    vpbroadcastd m9, [base+pb_8]
    punpcklbw    xm8, xm0, xm8
    jmp .w8_main2
.w8_no_upsample_above:
    lea          r3d, [hq+7]
    sub          angled, 90 ; angle - 90
    call .filter_strength
    test         r3d, r3d
    jz .w8_no_filter_above
    vpbroadcastd xm5, [base+pb_7]
    call .filter_top_w16
.w8_no_filter_above:
    lea          r3d, [angleq-51]
    mov          r3b, hb
    cmp          r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    vpbroadcastd ym0, [base+pb_90]
    psubb        ym0, ym17
    vpcmpgtb     k2{k2}, ym0, ym16
    kmovd        r3d, k2
.w8_filter_left:
    test         r3d, r3d
    jz .w8_main
    cmp          hd, 32
    je .w8_filter_left_h32
    popcnt       r3d, r3d
call .filter_left_h16 2038 jmp .w8_main 2039.w8_filter_left_h32: 2040 call .filter_left_h64 2041.w8_main: 2042 movshdup m16, [base+z_ypos_off2] 2043 vpbroadcastd m9, [base+pb_4] 2044.w8_main2: 2045 vbroadcasti32x4 m3, [base+z_ypos_mul1a] 2046 vpbroadcastw m0, dyd 2047 movshdup m1, [base+z_xpos_mul] 2048 vpbroadcastw m5, dxd 2049 vinserti32x4 m7, [tlq-16], 3 2050 vinserti32x4 m8, [tlq-16], 3 2051 pmullw m3, m0 2052 vpbroadcastd m2, [base+pb_1] 2053 pmullw m1, m5 ; xpos0..3 2054 psllw m5, 4 ; dx*4 2055 psraw m4, m3, 6 2056 psrlw m3, 1 2057 packsswb m4, m4 2058 vpermw m3, m3, m14 ; 64-frac, frac 2059 lea r3d, [dxq+(8<<6)] 2060 paddsb m4, m16 2061 shl dxd, 2 2062 paddsb m0, m4, m2 2063 lea r2, [strideq*3] 2064 punpcklbw m4, m0 ; base, base+1 2065.w8_loop: 2066 pshufb m16, m1, m2 2067 psrlw m0, m1, 3 2068 paddb m16, m10 2069 vpermw m0, m0, m14 2070 vpmovw2m k1, m16 ; base_x < 0 2071 vpermb m16, m16, m7 2072 pmaddubsw m16, m0 2073 vpermb m0, m4, m8 2074 pmaddubsw m16{k1}, m0, m3 2075 pmulhrsw m16, m15 2076 vpmovwb ym16, m16 2077 vextracti128 xm17, ym16, 1 2078 movq [dstq+strideq*0], xm16 2079 movhps [dstq+strideq*1], xm16 2080 movq [dstq+strideq*2], xm17 2081 movhps [dstq+r2 ], xm17 2082 sub hd, 4 2083 jz .w8_end 2084 paddw m1, m5 2085 lea dstq, [dstq+strideq*4] 2086 paddb m4, m9 2087 add r3d, dxd 2088 jge .w8_loop 2089.w8_leftonly_loop: 2090 vpermb m16, m4, m8 2091 pmaddubsw m16, m3 2092 paddb m4, m9 2093 pmulhrsw m16, m15 2094 vpmovwb ym16, m16 2095 vextracti128 xm17, ym16, 1 2096 movq [dstq+strideq*0], xm16 2097 movhps [dstq+strideq*1], xm16 2098 movq [dstq+strideq*2], xm17 2099 movhps [dstq+r2 ], xm17 2100 lea dstq, [dstq+strideq*4] 2101 sub hd, 4 2102 jg .w8_leftonly_loop 2103.w8_end: 2104 RET 2105.filter_top_w16: 2106 mova xm0, [base+z_filter_s1] 2107 popcnt r3d, r3d 2108 pminub xm4, xm5, [base+z_filter_s4] 2109 vpermi2b xm0, xm7, xm2 2110 pminub xm5, [base+z_filter_s5] 2111 pshufb xm1, xm7, [base+z_filter_s2] 2112 vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] 
2113 pshufb xm3, xm7, [base+z_filter_s3] 2114 vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] 2115 pshufb xm4, xm7, xm4 2116 vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] 2117 pshufb xm5, xm7, xm5 2118 pmaddubsw xm0, xm11 2119 pmaddubsw xm1, xm11 2120 pmaddubsw xm6, xm3, xm12 2121 vpbroadcastd xm12, r7m ; max_width 2122 pmaddubsw xm3, xm13 2123 pmaddubsw xm4, xm11 2124 pmaddubsw xm5, xm11 2125 packssdw xm12, xm12 2126 paddw xm0, xm6 2127 paddw xm1, xm3 2128 paddw xm0, xm4 2129 paddw xm1, xm5 2130 packsswb xm12, xm12 2131 pmulhrsw xm0, xm15 2132 pmulhrsw xm1, xm15 2133 vpcmpgtb k1, xm12, xm9 ; x < max_width 2134 packuswb xm7{k1}, xm0, xm1 2135 ret 2136.filter_left_h16: 2137 lea r5d, [hq-1] 2138 mova xm0, [base+z_filter_s1] 2139 vpbroadcastb xm5, r5d 2140 vpermi2b xm0, xm8, xm2 2141 pminub xm4, xm5, [base+z_filter_s4] 2142 pshufb xm1, xm8, [base+z_filter_s2] 2143 pminub xm5, [base+z_filter_s5] 2144 pshufb xm3, xm8, [base+z_filter_s3] 2145 vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] 2146 pshufb xm4, xm8, xm4 2147 vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] 2148 pshufb xm5, xm8, xm5 2149 vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] 2150 pmaddubsw xm0, xm11 2151 pmaddubsw xm1, xm11 2152 pmaddubsw xm6, xm3, xm12 2153 vpbroadcastd xm12, r8m ; max_height 2154 pmaddubsw xm3, xm13 2155 pmaddubsw xm4, xm11 2156 pmaddubsw xm5, xm11 2157 packssdw xm12, xm12 2158 paddw xm0, xm6 2159 paddw xm1, xm3 2160 paddw xm0, xm4 2161 paddw xm1, xm5 2162 packsswb xm12, xm12 2163 pmulhrsw xm0, xm15 2164 pmulhrsw xm1, xm15 2165 vpcmpgtb k1, xm12, xm9 ; y < max_height 2166 packuswb xm8{k1}, xm0, xm1 2167 ret 2168.w16: 2169 movu xm7, [tlq] ; top 2170 test angled, 0x400 2171 jnz .w16_main 2172 lea r3d, [hq+15] 2173 sub angled, 90 2174 call .filter_strength 2175 test r3d, r3d 2176 jz .w16_no_filter_above 2177 vpbroadcastd xm5, [base+pb_15] 2178 call .filter_top_w16 2179.w16_no_filter_above: 2180 cmp hd, 16 2181 jg .w16_filter_left_h64 2182 vpbroadcastd ym0, [base+pb_90] 
2183 psubb ym0, ym17 2184 vpcmpgtb k2{k2}, ym0, ym16 2185 kmovd r3d, k2 2186 test r3d, r3d 2187 jz .w16_main 2188 popcnt r3d, r3d 2189 call .filter_left_h16 2190 jmp .w16_main 2191.w16_filter_left_h64: 2192 call .filter_left_h64 2193.w16_main: 2194 vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8 2195 vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 2196 vpbroadcastw m0, dyd 2197 vinserti32x4 m7, [tlq-16], 3 2198 vpbroadcastd m2, [base+pb_1] 2199 vpbroadcastw m12, dxd 2200 movshdup m1, [base+z_xpos_mul] 2201 pmullw m6, m0 2202 vbroadcasti32x4 m3, [base+z_xpos_off2a] 2203 pmullw m5, m0 2204 vbroadcasti32x4 m4, [base+z_xpos_off2b] 2205 pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 2206 vpbroadcastd m9, [base+pb_4] 2207 psllw m12, 4 ; dx*4 2208 movshdup m16, [base+z_ypos_off2] 2209 psrlw m10, m6, 1 2210 psrlw m11, m5, 1 2211 vpermw m10, m10, m14 ; 64-frac, frac 2212 psraw m6, 6 2213 vpermw m11, m11, m14 2214 psraw m5, 6 2215 mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft 2216 packsswb m6, m5 2217 mov r3d, 1<<6 2218 paddsb m6, m16 2219 sub r5d, dxd ; left-only threshold 2220 paddsb m0, m6, m2 2221 shl dxd, 2 2222 punpcklbw m5, m6, m0 ; base, base+1 2223 lea r2, [strideq*3] 2224 punpckhbw m6, m0 2225.w16_loop: 2226 pshufb m17, m1, m2 2227 psrlw m0, m1, 3 2228 paddb m16, m3, m17 2229 vpermw m0, m0, m14 2230 paddb m17, m4 2231 vpmovw2m k1, m16 2232 vpermb m16, m16, m7 2233 vpmovw2m k2, m17 2234 vpermb m17, m17, m7 2235 pmaddubsw m16, m0 2236 pmaddubsw m17, m0 2237 add r3d, dxd 2238 jge .w16_toponly 2239 mova m0, m8 2240 vpermt2b m0, m5, m7 2241 pmaddubsw m16{k1}, m0, m10 2242 mova m0, m8 2243 vpermt2b m0, m6, m7 2244 pmaddubsw m17{k2}, m0, m11 2245.w16_toponly: 2246 pmulhrsw m16, m15 2247 pmulhrsw m17, m15 2248 packuswb m16, m17 2249 mova [dstq+strideq*0], xm16 2250 vextracti128 [dstq+strideq*1], ym16, 1 2251 vextracti32x4 [dstq+strideq*2], m16, 2 2252 vextracti32x4 [dstq+r2 ], m16, 3 2253 sub hd, 4 2254 jz .w16_end 2255 paddw m1, m12 2256 lea dstq, [dstq+strideq*4] 
2257 paddb m5, m9 2258 paddb m6, m9 2259 cmp r3d, r5d 2260 jge .w16_loop 2261.w16_leftonly_loop: 2262 vpermb m16, m5, m8 2263 vpermb m17, m6, m8 2264 pmaddubsw m16, m10 2265 pmaddubsw m17, m11 2266 paddb m5, m9 2267 paddb m6, m9 2268 pmulhrsw m16, m15 2269 pmulhrsw m17, m15 2270 packuswb m16, m17 2271 mova [dstq+strideq*0], xm16 2272 vextracti128 [dstq+strideq*1], ym16, 1 2273 vextracti32x4 [dstq+strideq*2], m16, 2 2274 vextracti32x4 [dstq+r2 ], m16, 3 2275 lea dstq, [dstq+strideq*4] 2276 sub hd, 4 2277 jg .w16_leftonly_loop 2278.w16_end: 2279 RET 2280.w32: 2281 movu ym7, [tlq] 2282 test angled, 0x400 2283 jnz .w32_main 2284 vpbroadcastd m2, [tlq-4] 2285 mova ym0, [base+z_filter_s1] 2286 vbroadcasti32x4 ym1, [base+z_filter_s2] 2287 vbroadcasti32x4 ym3, [base+z_filter_s3] 2288 vbroadcasti32x4 ym4, [base+z_filter_s4] 2289 vpermi2b ym0, ym7, ym2 ; al bl 2290 vpbroadcastd ym5, [base+pb_31] 2291 pminub ym5, [base+z_filter_s5] 2292 pshufb ym1, ym7, ym1 ; ah bh 2293 vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] 2294 pshufb ym3, ym7, ym3 ; cl ch 2295 vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] 2296 pshufb ym4, ym7, ym4 ; el dl 2297 vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] 2298 vpermb ym5, ym5, ym7 ; eh dh 2299 pmaddubsw ym0, ym11 2300 pmaddubsw ym1, ym11 2301 pmaddubsw ym6, ym3, ym12 2302 vpbroadcastd ym12, r6m 2303 pmaddubsw ym3, ym13 2304 pmaddubsw ym4, ym11 2305 pmaddubsw ym5, ym11 2306 mova m9, [pb_0to63] 2307 packssdw ym12, ym12 2308 paddw ym0, ym6 2309 paddw ym1, ym3 2310 paddw ym0, ym4 2311 paddw ym1, ym5 2312 packsswb ym12, ym12 2313 pmulhrsw ym0, ym15 2314 pmulhrsw ym1, ym15 2315 vpcmpgtb k1, ym12, ym9 ; x < max_width 2316 packuswb ym7{k1}, ym0, ym1 2317 cmp hd, 16 2318 jg .w32_filter_h64 2319 mov r3d, 3 2320 call .filter_left_h16 2321 jmp .w32_main 2322.w32_filter_h64: 2323 call .filter_left_h64 2324.w32_main: 2325 vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 
8 2326 vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 2327 vpbroadcastw m0, dyd 2328 vinserti32x4 m7, [tlq-16], 3 2329 rorx r2q, dxq, 62 ; dx << 2 2330 vpbroadcastd m2, [base+pb_1] 2331 vpbroadcastw m1, r2d 2332 pmullw m6, m0 2333 vbroadcasti32x8 m3, [base+z_xpos_off2a] 2334 pmullw m5, m0 2335 vbroadcasti32x8 m4, [base+z_xpos_off2b] 2336 mova ym0, ym1 2337 paddw m12, m1, m1 2338 vpbroadcastd m9, [base+pb_2] 2339 paddw m1, m0 ; xpos1 xpos0 2340 mova ym0, ym2 2341 psrlw m10, m6, 1 2342 psrlw m11, m5, 1 2343 vpermw m10, m10, m14 ; 64-frac, frac 2344 psraw m6, 6 2345 vpermw m11, m11, m14 2346 psraw m5, 6 2347 mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft 2348 packsswb m6, m5 2349 mov r3d, 1<<6 2350 paddsb m6, m0 2351 sub r5d, dxd ; left-only threshold 2352 paddsb m0, m6, m2 2353 add dxd, dxd 2354 punpcklbw m5, m6, m0 ; base, base+1 2355 punpckhbw m6, m0 2356.w32_loop: 2357 pshufb m17, m1, m2 2358 psrlw m0, m1, 3 2359 paddb m16, m3, m17 2360 vpermw m0, m0, m14 2361 paddb m17, m4 2362 vpmovw2m k1, m16 2363 vpermb m16, m16, m7 2364 vpmovw2m k2, m17 2365 vpermb m17, m17, m7 2366 pmaddubsw m16, m0 2367 pmaddubsw m17, m0 2368 add r3d, dxd 2369 jge .w32_toponly 2370 mova m0, m8 2371 vpermt2b m0, m5, m7 2372 pmaddubsw m16{k1}, m0, m10 2373 mova m0, m8 2374 vpermt2b m0, m6, m7 2375 pmaddubsw m17{k2}, m0, m11 2376.w32_toponly: 2377 pmulhrsw m16, m15 2378 pmulhrsw m17, m15 2379 packuswb m16, m17 2380 vextracti32x8 [dstq+strideq*0], m16, 1 2381 mova [dstq+strideq*1], ym16 2382 sub hd, 2 2383 jz .w32_end 2384 paddw m1, m12 2385 lea dstq, [dstq+strideq*2] 2386 paddb m5, m9 2387 paddb m6, m9 2388 cmp r3d, r5d 2389 jge .w32_loop 2390.w32_leftonly_loop: 2391 vpermb m16, m5, m8 2392 vpermb m17, m6, m8 2393 pmaddubsw m16, m10 2394 pmaddubsw m17, m11 2395 paddb m5, m9 2396 paddb m6, m9 2397 pmulhrsw m16, m15 2398 pmulhrsw m17, m15 2399 packuswb m16, m17 2400 vextracti32x8 [dstq+strideq*0], m16, 1 2401 mova [dstq+strideq*1], ym16 2402 lea dstq, [dstq+strideq*2] 2403 sub hd, 2 
2404 jg .w32_leftonly_loop 2405.w32_end: 2406 RET 2407.filter_left_h64: 2408 mova m0, [base+z_filter_s1] 2409 lea r3d, [hq-1] 2410 vbroadcasti32x4 m4, [base+z_filter_s4] 2411 vpbroadcastb m5, r3d 2412 vbroadcasti32x4 m1, [base+z_filter_s2] 2413 vbroadcasti32x4 m3, [base+z_filter_s3] 2414 vpermi2b m0, m8, m2 ; al bl 2415 pminub m5, [base+z_filter_s5] 2416 pshufb m1, m8, m1 ; ah bh 2417 vpbroadcastd m11, [base+z_filter_k+4*2+12*0] 2418 pshufb m3, m8, m3 ; cl ch 2419 vpbroadcastd m12, [base+z_filter_k+4*2+12*1] 2420 pshufb m4, m8, m4 ; el dl 2421 vpbroadcastd m13, [base+z_filter_k+4*2+12*2] 2422 vpermb m5, m5, m8 ; eh dh 2423 pmaddubsw m0, m11 2424 pmaddubsw m1, m11 2425 pmaddubsw m6, m3, m12 2426 vpbroadcastd m12, r8m ; max_height 2427 pmaddubsw m3, m13 2428 pmaddubsw m4, m11 2429 pmaddubsw m5, m11 2430 packssdw m12, m12 2431 paddw m0, m6 2432 paddw m1, m3 2433 paddw m0, m4 2434 paddw m1, m5 2435 packsswb m12, m12 2436 pmulhrsw m0, m15 2437 pmulhrsw m1, m15 2438 vpcmpgtb k1, m12, m9 ; y < max_height 2439 packuswb m8{k1}, m0, m1 2440 ret 2441.w64: 2442 movu m7, [tlq] 2443 test angled, 0x400 2444 jnz .w64_main 2445 vpbroadcastd m2, [tlq-4] 2446 mova m0, [base+z_filter_s1] 2447 vbroadcasti32x4 m1, [base+z_filter_s2] 2448 vbroadcasti32x4 m3, [base+z_filter_s3] 2449 vbroadcasti32x4 m4, [base+z_filter_s4] 2450 vpermi2b m0, m7, m2 ; al bl 2451 vpbroadcastd m5, [base+pb_63] 2452 pminub m5, [base+z_filter_s5] 2453 pshufb m1, m7, m1 ; ah bh 2454 vpbroadcastd m11, [base+z_filter_k+4*2+12*0] 2455 pshufb m3, m7, m3 ; cl ch 2456 vpbroadcastd m12, [base+z_filter_k+4*2+12*1] 2457 pshufb m4, m7, m4 ; el dl 2458 vpbroadcastd m13, [base+z_filter_k+4*2+12*2] 2459 vpermb m5, m5, m7 ; eh dh 2460 pmaddubsw m0, m11 2461 pmaddubsw m1, m11 2462 pmaddubsw m6, m3, m12 2463 vpbroadcastd m12, r6m 2464 pmaddubsw m3, m13 2465 pmaddubsw m4, m11 2466 pmaddubsw m5, m11 2467 mova m9, [pb_0to63] 2468 packssdw m12, m12 2469 paddw m0, m6 2470 paddw m1, m3 2471 paddw m0, m4 2472 paddw m1, m5 2473 packsswb 
m12, m12 2474 pmulhrsw m0, m15 2475 pmulhrsw m1, m15 2476 vpcmpgtb k1, m12, m9 ; x < max_width 2477 packuswb m7{k1}, m0, m1 2478 call .filter_left_h64 ; always filter the full 64 pixels for simplicity 2479.w64_main: 2480 vpbroadcastw m5, dyd 2481 vpbroadcastd m9, [tlq-4] 2482 rorx r2q, dxq, 62 ; dx << 2 2483 pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such 2484 pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge 2485 vpbroadcastw m1, r2d ; xpos 2486 mova m3, [base+z_xpos_off2a] 2487 mova m4, [base+z_xpos_off2b] 2488 mova m12, m1 2489 vpbroadcastd m2, [base+pb_1] 2490 psrlw m10, m6, 1 2491 psrlw m11, m5, 1 2492 vpermw m10, m10, m14 ; 64-frac, frac 2493 psraw m6, 6 2494 vpermw m11, m11, m14 2495 psraw m5, 6 2496 mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft 2497 packsswb m6, m5 2498 mov r3d, 1<<6 2499 paddsb m0, m6, m2 2500 sub r5d, dxd ; left-only threshold 2501 punpcklbw m5, m6, m0 ; base, base+1 2502 punpckhbw m6, m0 2503.w64_loop: 2504 pshufb m17, m1, m2 2505 psrlw m0, m1, 3 2506 paddb m16, m3, m17 2507 vpermw m0, m0, m14 2508 paddb m17, m4 2509 vpmovw2m k1, m16 ; base_x < 0 2510 vpermi2b m16, m7, m9 2511 vpmovw2m k2, m17 2512 vpermi2b m17, m7, m9 2513 pmaddubsw m16, m0 2514 pmaddubsw m17, m0 2515 add r3d, dxd 2516 jge .w64_toponly 2517 mova m0, m8 2518 vpermt2b m0, m5, m9 2519 pmaddubsw m16{k1}, m0, m10 2520 mova m0, m8 2521 vpermt2b m0, m6, m9 2522 pmaddubsw m17{k2}, m0, m11 2523.w64_toponly: 2524 pmulhrsw m16, m15 2525 pmulhrsw m17, m15 2526 packuswb m16, m17 2527 mova [dstq], m16 2528 dec hd 2529 jz .w64_end 2530 paddw m1, m12 2531 add dstq, strideq 2532 paddb m5, m2 2533 paddb m6, m2 2534 cmp r3d, r5d 2535 jge .w64_loop 2536.w64_leftonly_loop: 2537 vpermb m16, m5, m8 2538 vpermb m17, m6, m8 2539 pmaddubsw m16, m10 2540 pmaddubsw m17, m11 2541 paddb m5, m2 2542 paddb m6, m2 2543 pmulhrsw m16, m15 2544 pmulhrsw m17, m15 2545 packuswb m16, m17 2546 mova [dstq], m16 2547 add dstq, strideq 2548 dec 
hd
    jg .w64_leftonly_loop
.w64_end:
    RET

; Z3 directional intra prediction (third zone, angle > 180; predicts from the
; left edge), 8bpc, AVX-512 (Ice Lake / VBMI).
; Arguments: dst, stride, tl (edge pixel pointer; the left column is read from
; below tlq via the [tlq-64*1]/[tlq-64*2] loads), w, h, angle, dy.
; angle bit 0x400 disables the intra edge filter (tested on every path below);
; dy is looked up from dr_intra_derivative and pre-scaled by 64.
; Registers set up once for all width paths:
;   m0 = pb_63to0 byte-reversal permute, m14 = z_frac_table (64-frac, frac
;   weight pairs), m15 = pw_512 (pmulhrsw rounding constant).
cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea                  r7, [z_filter_t0]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    lea                  t0, [dr_intra_derivative+45*2-1]
    movsxd               wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
    sub              angled, 180
    mov                 dyd, angled
    neg                 dyd
    xor              angled, 0x400
    or                  dyq, ~0x7e
    mova                 m0, [base+pb_63to0]
    movzx               dyd, word [t0+dyq]
    lea                  wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
    movifnidn            hd, hm
    mova                m14, [base+z_frac_table]
    shl                 dyd, 6
    vpbroadcastd        m15, [base+pw_512]
    jmp                  wq
.w4:
    cmp              angleb, 40
    jae .w4_no_upsample
    lea                 r3d, [angleq-1024]
    sar                 r3d, 7
    add                 r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    lea                 r3d, [hq+4]
    call .upsample
    movshdup             m1, [base+z_ypos_off1]
    vpbroadcastd         m6, [base+pb_16]
    jmp .w4_main2
.w4_no_upsample:
    lea                 r3d, [hq+3]
    vpbroadcastb         m9, r3d
    vpxord               m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
    pmaxub               m1, m0
    vpermb               m7, m1, [tlq-64*1] ; reversed left edge, clamped
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    ; decide whether to smooth the edge based on angle and block size
    vpbroadcastb        xm1, angled
    shr              angled, 8
    vpcmpeqb             k1, xm9, [base+z_filter_wh]
    vpbroadcastd         m2, [tlq-3]
    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw               r5d, k1
    test                r5d, r5d
    jz .w4_main
    pminub               m9, [pb_0to63]
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
    vpermb               m7, m9, m0
.w4_main:
    movsldup             m1, [base+z_ypos_off1]
    vpbroadcastd         m6, [base+pb_8]
.w4_main2:
    vpbroadcastw         m0, dyd
    vpbroadcastq         m2, [base+z_ypos_mul2a] ; 1..4
    pmulhuw              m2, m0 ; ypos >> 1
    lea                  r2, [strideq*3]
    vpermw               m3, m2, m14 ; 64-frac, frac
    psrlw                m2, 5
    packsswb             m2, m2
    punpcklbw            m2, m2
    paddsb               m2, m1 ; base, base+1
.w4_loop:
    ; gather edge pixel pairs, weight, round, and emit 4-pixel rows
    vpermb               m0, m2, m7
    pmaddubsw            m0, m3
    paddsb               m2, m6
    pmulhrsw             m0, m15
    vpmovwb             ym0, m0
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    sub                  hd, 8
    jl .w4_end
    vextracti32x4       xm0, ym0, 1
    lea                dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.upsample:
    ; shared by .w4/.w8: 2x upsample of the left edge into ym7
    ; (doubles dy to match the doubled sample resolution)
    xor                 r3d, 31 ; 31 - (h + imin(w, h))
    vbroadcasti32x4     ym0, [base+z_xpos_off2a]
    vpbroadcastb        ym7, r3d
    pmaxub              ym7, [base+z3_upsample]
    vbroadcasti32x4     ym1, [base+z_filter_s4]
    vpermb              ym7, ym7, [tlq-31]
    vpbroadcastd        ym2, [base+pb_m4_36]
    pshufb              ym0, ym7, ym0
    psrldq              ym7, 1
    pshufb              ym1, ym7, ym1
    pmaddubsw           ym0, ym2
    pmaddubsw           ym1, ym2
    add                 dyd, dyd
    paddw               ym0, ym1
    pmulhrsw            ym0, ym15
    packuswb            ym0, ym0
    punpcklbw           ym7, ym0 ; interleave original and interpolated pixels
    ret
.w8:
    lea                 r3d, [angleq+216]
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea                 r3d, [hq*2]
    call .upsample
    pshufd               m1, [base+z_ypos_off1], q0000
    vpbroadcastd         m6, [base+pb_8]
    jmp .w8_main2
.w8_no_upsample:
    mov                 r3d, 8
    cmp                  hd, 4
    cmove               r3d, hd
    lea                 r3d, [r3+hq-1]
    xor                 r3d, 63 ; 63 - (h + imin(w, h))
    ; NOTE(review): broadcasting wd (tzcnt'd width) here looks suspect -- the
    ; clamp value was just computed into r3d; verify against upstream dav1d.
    vpbroadcastb         m1, wd
    pmaxub               m1, m0
    vpermb               m7, m1, [tlq-64*1]
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w8_main
    lea                 r3d, [hq+7]
    call .filter_strength
    test                r5d, r5d
    jz .w8_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    vpermb               m7, m10, m0
.w8_main:
    movsldup             m1, [base+z_ypos_off2]
    vpbroadcastd         m6, [base+pb_4]
.w8_main2:
    vpbroadcastw         m0, dyd
    vbroadcasti32x4      m2, [base+z_ypos_mul2a] ; 1..8
    pmulhuw              m2, m0 ; ypos >> 1
    lea                  r2, [strideq*3]
    vpermw               m3, m2, m14 ; 64-frac, frac
    psrlw                m2, 5
    packsswb             m2, m2
    punpcklbw            m2, m2
    paddsb               m2, m1 ; base, base+1
.w8_loop:
    vpermb               m0, m2, m7
    pmaddubsw            m0, m3
    paddsb               m2, m6
    pmulhrsw             m0, m15
    vpmovwb             ym0, m0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
.filter_strength:
    ; out: r5d nonzero if the edge filter applies for this angle/size
    ;      (z_filter_wh x z_filter_t0 threshold test), m9 = size broadcast,
    ;      m10 = identity permute clamped to the valid edge length
    vpbroadcastd         m2, [tlq-3]
.filter_strength2:
    vpbroadcastb         m9, r3d
    vpbroadcastb        ym1, angled
    shr              angled, 8
    vpcmpeqb             k1, ym9, [base+z_filter_wh]
    mova                xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb         k1{k1}, ym1, ym0
    pminub              m10, m9, [pb_0to63]
    kmovd               r5d, k1
    ret
.w16_load:
    ; in: r3d = width cap. Loads up to 128 left-edge pixels into m7 (first 64)
    ; and m8 (remainder), permuted through m0; indices are clamped via pmaxub
    ; so out-of-range lanes replicate edge pixels -- TODO confirm intent.
    cmp                 r3d, hd
    cmovae              r3d, hd
    add                 r3d, hd
    mova                 m7, [tlq-64*1]
    neg                 r3d ; -(h + imin(w, h))
    and                 r3d, 63
    vpbroadcastb         m1, r3d
    pmaxub               m2, m0, m1
    cmp                  hd, 64
    je .w16_load_h64
    vpermb               m8, m1, m7
    vpermb               m7, m2, m7
    ret
.w16_load_h64:
    vpermb               m7, m0, m7
    vpermb               m8, m2, [tlq-64*2]
    ret
.w16:
    mov                 r3d, 16
    call .w16_load
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w16_main
    vpbroadcastd         m2, [tlq-3]
    cmp                  hd, 64
    je .w16_filter64
    lea                 r3d, [hq+15]
    call .filter_strength2
    test                r5d, r5d
    jz .w16_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    pminub              m10, m9, [pb_0to63]
    vpermb               m8, m9, m0
    vpermb               m7, m10, m0
    jmp .w16_main
.w16_filter64:
    vpbroadcastd        m13, [base+pb_15]
    valignq              m0, m8, m7, 7
    pminub              m12, m13, [pb_0to63]
    valignq             m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
    vbroadcasti32x4      m3, [base+z_ypos_mul2a] ; 1..8
    vbroadcasti32x4      m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw         m0, dyd
    vpbroadcastd         m6, [base+pb_4]
    pmulhuw              m3, m0 ; ypos >> 1
    pmulhuw              m2, m0
    movshdup             m0, [base+z_ypos_off2]
    lea                  r2, [strideq*3]
    vpbroadcastd         m1, [base+pb_1]
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m3, m0
    paddsb               m1, m3
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
.w16_loop:
; Shared inner step for w16/w32/w64: gather edge pixels through the 128-byte
; m7:m8 table with the base/base+1 indices in m2/m3, apply the frac weights
; (m4/m5), round via pw_512, and advance the indices by the row step in m6.
%macro Z3_PERM2 0
    mova                 m0, m7
    vpermt2b             m0, m2, m8
    mova                 m1, m7
    vpermt2b             m1, m3, m8
    pmaddubsw            m0, m4
    pmaddubsw            m1, m5
    paddsb               m2, m6
    paddsb               m3, m6
    pmulhrsw             m0, m15
    pmulhrsw             m1, m15
    packuswb             m0, m1
%endmacro
    Z3_PERM2
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16_loop
    RET
.w32:
    mov                 r3d, 32
    call .w16_load
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    vpbroadcastd         m2, [tlq-3]
    cmp                  hd, 64
    je .w32_filter64
    lea                 r3d, [hq+31]
    vpbroadcastb         m9, r3d
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
    vpermb               m8, m9, m7
    jmp .w32_main
.w32_filter64:
    vpbroadcastd        m13, [base+pb_31]
    valignq              m0, m8, m7, 7
    pminub              m12, m13, [pb_0to63]
    valignq             m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
    vbroadcasti32x8      m3, [base+z_ypos_mul2a] ; 1..8
    vbroadcasti32x8      m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw         m0, dyd
    vpbroadcastd         m1, [base+pb_1]
    pmulhuw              m3, m0 ; ypos >> 1
    pmulhuw              m2, m0
    vpbroadcastd         m6, [base+pb_2]
    mova                ym0, ym1
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m3, m0
    paddsb               m1, m3
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
.w32_loop:
    Z3_PERM2
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+strideq*1], ym0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64:
    mova                 m7, [tlq-64*1]
    cmp                  hd, 64
    je .w64_h64
    lea                 r3d, [hq*2-1]
    xor                 r3d, 63 ; -(h + imin(w, h)) & 63
    vpbroadcastb         m1, r3d
    pmaxub               m0, m1
    vpermb               m8, m1, m7
    jmp .w64_filter
.w64_h64:
    vpermb               m8, m0, [tlq-64*2]
.w64_filter:
    vpermb               m7, m0, m7
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    lea                 r3d, [hq-1]
    vpbroadcastd         m2, [tlq-3]
    vpbroadcastb        m13, r3d
    valignq              m0, m8, m7, 7
    pminub              m12, m13, [pb_0to63]
    valignq             m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
    vpbroadcastw         m2, dyd
    pmulhuw              m3, m2, [base+z_ypos_mul2a]
    pmulhuw              m2, [base+z_ypos_mul2b]
    vpbroadcastd         m6, [base+pb_1] ; one row per iteration
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m1, m3, m6
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
.w64_loop:
    Z3_PERM2
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET

; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4.
; w4    w8        w16            w32
; 1     1 2       1 2 3 4        1 2 3 4 9 a b c
; 2     2 3       2 3 4 5        2 3 4 5 a b c d
; 3     3 4       3 4 5 6        3 4 5 6 b c d e
; 4     4 5       4 5 6 7        4 5 6 7 c d e f
; 5     5 6       5 6 7 8        5 6 7 8 d e f g
; 6     6 7       6 7 8 9        6 7 8 9 e f g h
; 7     7 8       7 8 9 a        7 8 9 a f g h i
; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
; 9 9 a b h i j
; a b i j
; b j

; Recursive filter intra prediction (FILTER_PRED), 8bpc, AVX-512 (Ice Lake).
; Arguments: dst, stride, tl (edge pointer: top row read from tlq+1, left
; column from tlq-1 downwards as tlq is decremented), w, h, flt (filter mode,
; scaled by 64 to index the VNNI-reordered filter_taps table at the top of
; the file). Each 4x2 block is a vpdpbusd dot product of its neighbors with
; two tap sets: m7/m8 = "p1 p2 p3 p4" taps applied to the top pixels (m2),
; m9/m10 = "p6 p5 p0 __" taps applied to the left/top-left pixels (m3);
; m6 = pd_8 rounding bias, m11 = filter_perm gather pattern. Blocks are
; produced in the diagonal order of the diagram above.
cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
%define base r6-filter_taps
    lea                  r6, [filter_taps]
%ifidn fltd, fltm
    movzx              fltd, fltb
%else
    movzx              fltd, byte fltm
%endif
    vpbroadcastd       xmm2, [tlq+1] ; t0 t0 t0 t0
    movifnidn            hd, hm
    shl                fltd, 6 ; 64 bytes of taps per filter mode
    vpbroadcastd         m6, [base+pd_8]
    vpbroadcastd       xmm3, [tlq-2] ; l1 l0 tl __
    vbroadcasti32x4      m7, [r6+fltq+16*0] ; p1 p2 p3 p4
    vbroadcasti32x4      m8, [r6+fltq+16*1]
    vbroadcasti32x4      m9, [r6+fltq+16*2] ; p6 p5 p0 __
    vbroadcasti32x4     m10, [r6+fltq+16*3]
    ; first 4x2 block (block 1 in the diagram)
    mova               xmm0, xm6
    vpdpbusd           xmm0, xmm2, xm7
    mova               xmm1, xm6
    vpdpbusd           xmm1, xmm2, xm8
    vpdpbusd           xmm0, xmm3, xm9
    vpdpbusd           xmm1, xmm3, xm10
    packssdw           xmm0, xmm1
    cmp                  wd, 8
    jb .w4
    vpbroadcastd        ym2, [tlq+5]
    mova                m11, [base+filter_perm]
    mov                  r5, 0xffffffffffff000f
    psrldq             xmm2, 1 ; __ t0
    kmovq                k1, r5 ; 0x000f
    psraw               xm5, xmm0, 4
    packuswb           xmm2, xm5 ; __ t0 a0 b0
    pshufd          ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1
    je .w8
    ; w16/w32 prologue: prime the 3-deep diagonal pipeline
    kxnorb               k3, k3, k3 ; 0x00ff
    vpbroadcastd        xm3, [tlq-4]
    kandnq               k2, k3, k1 ; 0xffffffffffff0000
    vpermb          ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __
    mova                ym0, ym6
    vpdpbusd            ym0, ym2, ym7
    mova                ym1, ym6
    vpdpbusd            ym1, ym2, ym8
    pshufb          ym5{k2}, ym2, ym11 ; a0 b0 __ t0
    vpbroadcastd         m2, [tlq+9]
    vpdpbusd            ym0, ym3, ym9
    vpdpbusd            ym1, ym3, ym10
    vpbroadcastd        xm3, [tlq-6] ; l5 l4 l3 __
    kunpckbw             k4, k1, k3 ; 0x0fff
    packssdw            ym0, ym1
    psraw               ym0, 4 ; c0 d0 a1 b1
    packuswb            ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1
    pshufd           m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
    vpermb           m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    psrldq               m0, m2, 1 ; __ d0 __ b0 __ t0
    vpbroadcastd         m2, [tlq+13]
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    mova                m12, [base+filter_end]
    lea                 r5d, [hq-6]
    mov                  r6, dstq
    ; PF here still comes from the `cmp wd, 8` above (vector/k-mask/mov/lea
    ; ops do not modify RFLAGS): PF=1 for w=32, PF=0 for w=16
    cmovp                hd, r5d ; w == 16 ? h : h - 6
    packssdw             m4, m1
    psraw                m4, 4 ; e0 f0 c1 d1 a2 b2
    packuswb             m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
    pshufd           m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
.w16_loop:
    ; steady state: three diagonals in flight per iteration
    vpbroadcastd        xm3, [tlq-8]
    vpermb           m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
    mova                 m1, m6
    vpdpbusd             m1, m2, m7
    mova                 m0, m6
    vpdpbusd             m0, m2, m8
    sub                 tlq, 2 ; consume two left pixels per row pair
    vpdpbusd             m1, m3, m9
    vpdpbusd             m0, m3, m10
    packssdw             m1, m0
    mova                 m0, m4
    psraw                m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3
    packuswb             m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
    pshufd               m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    vpermt2d             m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
    vextracti32x4 [dstq+strideq*0], m5, 2
    vextracti32x4 [dstq+strideq*1], m5, 3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    cmp                  wd, 16
    je .ret
    ; w32: the right 16 columns re-read finished left-half rows from dst (r6)
    mova               xm13, [filter_perm+16]
    mova               xmm3, [r6+strideq*0]
    punpckhdq          xmm3, [r6+strideq*1]
    vpbroadcastd     m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    pinsrb              xm3, xmm3, [tlq+r5+16], 7
    pshufb              xm3, xm13
    vpermb           m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
    mova                 m0, m6
    vpdpbusd             m0, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kunpckbw             k5, k3, k1 ; 0xff0f
    lea                  r3, [strideq*3]
    vpdpbusd             m0, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m0, m1
    psraw                m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3
    packuswb             m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpblendmb        m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpbroadcastd        ym2, [tlq+r5+21]
    pshufd           m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
    vpermt2d             m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
    vextracti32x4 [dstq+strideq*0], m5, 2
    vextracti32x4 [dstq+strideq*1], m5, 3
    punpckhqdq         xmm3, [r6+r3]
    pinsrb             xmm3, [r6+strideq*2+15], 11
    pshufb              xm3, xmm3, xm13
    vpermb           m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kxnord               k3, k3, k4 ; 0xfffff0ff
    lea                  r4, [strideq*5]
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m4, m1
    psraw                m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3
    packuswb             m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpblendmw        m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpbroadcastd         m2, [tlq+r5+25]
    pshufd           m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
    vpermt2d             m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
    vextracti32x4 [dstq+strideq*2], m5, 2
    vextracti32x4 [dstq+r3       ], m5, 3
    punpckhqdq         xmm3, [r6+r4]
    pinsrb             xmm3, [r6+strideq*4+15], 11
    pshufb              xm3, xmm3, xm13
    vpermb           m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb gb fb __
    mova                 m0, m6
    vpdpbusd             m0, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kunpckwd             k1, k1, k2 ; 0x000f0000
    vpdpbusd             m0, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m0, m1
    psraw                m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3
    packuswb             m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
    vpblendmw        m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
    vpbroadcastd         m2, [tlq+r5+29]
    pshufd           m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
    vpermt2d             m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
    vextracti32x4 [dstq+strideq*4], m5, 2
    vextracti32x4 [dstq+r4       ], m5, 3
    lea                  r0, [strideq+r3*2]
.w32_loop:
    punpckhqdq         xmm3, [r6+r0]
    pinsrb             xmm3, [r6+r3*2+15], 11
    pshufb              xm3, xmm3, xm13
    vpermb           m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
.w32_loop_tail:
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m4, m1
    mova                 m1, m0
    psraw                m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7
    packuswb             m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
    pshufd               m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
    vpermt2d             m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
    vextracti32x4 [r6+strideq*0+16], m5, 2
    vextracti32x4 [r6+strideq*1+16], m5, 3
    lea                  r6, [r6+strideq*2]
    sub                 r5d, 2
    jg .w32_loop
    ; drain the pipeline (no more dst rows to gather from)
    vpermb               m3, m11, m1
    cmp                 r5d, -6
    jg .w32_loop_tail
.ret:
    RET
.w8:
    vpermb              ym3, ym11, ymm2
.w8_loop:
    ; two 4x2 blocks per iteration (diagonal pairs from the diagram)
    vpbroadcastd    ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __
    mova                ym0, ym6
    vpdpbusd            ym0, ym2, ym7
    mova                ym1, ym6
    vpdpbusd            ym1, ym2, ym8
    sub                 tlq, 2
    vpdpbusd            ym0, ym3, ym9
    vpdpbusd            ym1, ym3, ym10
    mova                ym3, ym5
    packssdw            ym0, ym1
    psraw               ym5, ym0, 4 ; c0 d0 a1 b1
    packuswb            ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1
    pshufd              ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1
    vpermb              ym3, ym11, ym3 ; a0 a1 b0 b1
    movq   [dstq+strideq*0], xm3
    movhps [dstq+strideq*1], xm3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w4_loop:
    vpbroadcastd       xmm3, [tlq-4] ; l3 l2 l1 __
    mova               xmm0, xm6
    vpdpbusd           xmm0, xmm2, xm7
    mova               xmm1, xm6
    vpdpbusd           xmm1, xmm2, xm8
    sub                 tlq, 2
    vpdpbusd           xmm0, xmm3, xm9
    vpdpbusd           xmm1, xmm3, xm10
    packssdw           xmm0, xmm1
.w4:
    ; entered with the first block's sums already in xmm0
    psraw              xmm0, 4 ; a0 b0
    packuswb           xmm0, xmm0
    movd   [dstq+strideq*0], xmm0
    pshufd             xmm2, xmm0, q1111 ; b0 b0 b0 b0
    movd   [dstq+strideq*1], xmm2
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_loop
    RET

%endif ; ARCH_X86_64