; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db  0,  0,  0,  0
            ; 2 @4
            db 45, 19, 64,  0
            ; 4 @8
            db 39, 25, 50, 14, 59,  5, 64,  0
            ; 8 @16
            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
            ; 16 @32
            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
            ; 32 @64
            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
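
; The byte pairs above store (m, 64-m), so a single pmaddubsw can apply both
; blend weights at once. A scalar sketch of the OBMC blend these pairs feed
; (illustration only, not the authoritative C implementation):
;   dst[x] = (mask[x]*obmc_px[x] + (64 - mask[x])*dst[x] + 32) >> 6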

warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
blend_shuf:     db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
                db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufD: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
subpel_h_shufE: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
subpel_h_shufF: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4:  db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
unpckw:         db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul:    dd 0, 1, 2, 3
resize_shuf:    db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7

wm_420_sign: times 4 dw 258
             times 4 dw 257
wm_422_sign: times 8 db 128
             times 8 db 127

pb_8x0_8x8: times 8 db 0
            times 8 db 8
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

pb_64:    times 16 db 64
pw_m256:  times 8 dw -256
pw_1:     times 8 dw 1
pw_2:     times 8 dw 2
pw_8:     times 8 dw 8
pw_15:    times 8 dw 15
pw_26:    times 8 dw 26
pw_34:    times 8 dw 34
pw_512:   times 8 dw 512
pw_1024:  times 8 dw 1024
pw_2048:  times 8 dw 2048
pw_6903:  times 8 dw 6903
pw_8192:  times 8 dw 8192
pd_32:    times 4 dd 32
pd_63:    times 4 dd 63
pd_512:   times 4 dd 512
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000:times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000
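
; mc_warp_filter2 below packs dav1d_mc_warp_filter[] for pmaddubsw: each of
; the three blocks ([-1, 0), [0, 1), [1, 2)) covers 64 fractional positions,
; with the 8 taps of every entry reordered into byte pairs so that a single
; pmaddubsw forms two-tap partial sums without further shuffling.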

const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
; [-1, 0)
db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0
db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0
db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0
db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0
db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0
db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0
db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0
db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0
db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0
db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0
db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0
db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0
db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0
db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0
db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0
db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0
db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0
db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0
db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0
db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0
db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0
db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0
db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0
db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0
db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0
db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0
db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0
db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0
db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0
db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0
db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0
db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0
; [0, 1)
db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0
db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0
db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1
db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1
db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1
db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1
db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1
db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1
db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2
db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2
db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2
db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2
db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2
db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2
db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2
db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2
db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2
db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2
db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2
db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2
db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2
db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2
db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2
db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2
db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2
db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1
db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2
db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1
db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1
db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1
db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0
db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0
; [1, 2)
db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0
db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1
db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1
db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1
db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1
db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2
db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2
db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2
db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3
db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3
db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3
db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4
db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4
db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4
db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4
db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4
db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4
db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4
db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4
db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4
db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4
db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4
db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4
db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3
db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3
db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3
db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2
db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2
db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2
db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1
db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1
db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0
db 0, 0, 2, -1, 0, 0, 127, 0

pw_258: times 2 dw 258

cextern mc_subpel_filters
; -8 because the packed filter index is 1-based (mx/my == 0 means no
; subpel filtering)
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

%macro BIDIR_JMP_TABLE 2-*
    ; table offset is evaluated at definition time (in the loop below)
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    ; dynamically generated label
    %%table:
    %rep %0 - 2 ; repeat for each width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
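
; As an illustrative expansion (assuming the usual tzcnt(w)*4 indexing used
; by the consumers of these tables): "BIDIR_JMP_TABLE avg, ssse3, 4, 8, ..."
; emits dd offsets to dav1d_avg_8bpc_ssse3.w4 .. .w128 and defines
; avg_ssse3_table = %%table - 2*4. The -2*%3 bias cancels the tzcnt scaling
; of the smallest width, which works out because the first width is always
; 2 or 4 here.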

BIDIR_JMP_TABLE avg,        ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 16, 16, 16

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE prep, 8tap,  ssse3, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,    4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

SECTION .text

INIT_XMM ssse3

%if ARCH_X86_32
 DECLARE_REG_TMP 1
 %define base t0-put_ssse3
%else
 DECLARE_REG_TMP 7
 %define base 0
%endif

%macro RESTORE_DSQ_32 1
 %if ARCH_X86_32
    mov        %1, dsm ; restore dsq
 %endif
%endmacro

cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn  mxyd, r6m ; mx
    LEA        t0, put_ssse3
    movifnidn  srcq, srcmp
    movifnidn  ssq, ssmp
    tzcnt      wd, wm
    mov        hd, hm
    test       mxyd, mxyd
    jnz .h
    mov        mxyd, r7m ; my
    test       mxyd, mxyd
    jnz .v
.put:
    movzx      wd, word [t0+wq*2+table_offset(put,)]
    add        wq, t0
    RESTORE_DSQ_32 t0
    jmp        wq
.put_w2:
    movzx      r4d, word [srcq+ssq*0]
    movzx      r6d, word [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4w
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .put_w2
    RET
.put_w4:
    mov        r4d, [srcq+ssq*0]
    mov        r6d, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .put_w4
    RET
.put_w8:
    movq       m0, [srcq+ssq*0]
    movq       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .put_w8
    RET
.put_w16:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .put_w16
    RET
.put_w32:
    movu       m0, [srcq+ssq*0+16*0]
    movu       m1, [srcq+ssq*0+16*1]
    movu       m2, [srcq+ssq*1+16*0]
    movu       m3, [srcq+ssq*1+16*1]
    lea        srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0+16*0], m0
    mova       [dstq+dsq*0+16*1], m1
    mova       [dstq+dsq*1+16*0], m2
    mova       [dstq+dsq*1+16*1], m3
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .put_w32
    RET
.put_w64:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    add        srcq, ssq
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    mova       [dstq+16*2], m2
    mova       [dstq+16*3], m3
    add        dstq, dsq
    dec        hd
    jg .put_w64
    RET
.put_w128:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    mova       [dstq+16*2], m2
    mova       [dstq+16*3], m3
    movu       m0, [srcq+16*4]
    movu       m1, [srcq+16*5]
    movu       m2, [srcq+16*6]
    movu       m3, [srcq+16*7]
    mova       [dstq+16*4], m0
    mova       [dstq+16*5], m1
    mova       [dstq+16*6], m2
    mova       [dstq+16*7], m3
    add        srcq, ssq
    add        dstq, dsq
    dec        hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
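    ; mxyd*0x00ff00ff + 0x00100010 leaves each 16-bit lane holding
    ; (mx << 8) | (16 - mx), i.e. the byte pair (16-mx, mx) that pmaddubsw
    ; needs; e.g. mx == 4 yields lanes of 0x040c == (12, 4). pmulhrsw with
    ; pw_2048 then computes (x*2048 + 0x4000) >> 15 == (x + 8) >> 4, the
    ; rounding shift from the formula above.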
    imul       mxyd, 0x00ff00ff
    mova       m4, [base+subpel_h_shufD]
    mova       m0, [base+bilin_h_shuf4]
    add        mxyd, 0x00100010
    movd       m5, mxyd
    mov        mxyd, r7m ; my
    pshufd     m5, m5, q0000
    test       mxyd, mxyd
    jnz .hv
    movzx      wd, word [t0+wq*2+table_offset(put, _bilin_h)]
    mova       m3, [base+pw_2048]
    add        wq, t0
    movifnidn  dsq, dsmp
    jmp        wq
.h_w2:
    pshufd     m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
.h_w2_loop:
    movd       m0, [srcq+ssq*0]
    movd       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpckldq  m0, m1
    pshufb     m0, m4
    pmaddubsw  m0, m5
    pmulhrsw   m0, m3
    packuswb   m0, m0
    movd       r6d, m0
    mov        [dstq+dsq*0], r6w
    shr        r6d, 16
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movq       m4, [srcq+ssq*0]
    movhps     m4, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m4, m0
    pmaddubsw  m4, m5
    pmulhrsw   m4, m3
    packuswb   m4, m4
    movd       [dstq+dsq*0], m4
    psrlq      m4, 32
    movd       [dstq+dsq*1], m4
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .h_w4
    RET
.h_w8:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    pmulhrsw   m0, m3
    pmulhrsw   m1, m3
    packuswb   m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .h_w8
    RET
.h_w16:
    movu       m0, [srcq+8*0]
    movu       m1, [srcq+8*1]
    add        srcq, ssq
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    pmulhrsw   m0, m3
    pmulhrsw   m1, m3
    packuswb   m0, m1
    mova       [dstq], m0
    add        dstq, dsq
    dec        hd
    jg .h_w16
    RET
.h_w32:
    movu       m0, [srcq+mmsize*0+8*0]
    movu       m1, [srcq+mmsize*0+8*1]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    pmulhrsw   m0, m3
    pmulhrsw   m1, m3
    packuswb   m0, m1
    movu       m1, [srcq+mmsize*1+8*0]
    movu       m2, [srcq+mmsize*1+8*1]
    add        srcq, ssq
    pshufb     m1, m4
    pshufb     m2, m4
    pmaddubsw  m1, m5
    pmaddubsw  m2, m5
    pmulhrsw   m1, m3
    pmulhrsw   m2, m3
    packuswb   m1, m2
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    add        dstq, dsq
    dec        hd
    jg .h_w32
    RET
.h_w64:
    mov        r6, -16*3
.h_w64_loop:
    movu       m0, [srcq+r6+16*3+8*0]
    movu       m1, [srcq+r6+16*3+8*1]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    pmulhrsw   m0, m3
    pmulhrsw   m1, m3
    packuswb   m0, m1
    mova       [dstq+r6+16*3], m0
    add        r6, 16
    jle .h_w64_loop
    add        srcq, ssq
    add        dstq, dsq
    dec        hd
    jg .h_w64
    RET
.h_w128:
    mov        r6, -16*7
.h_w128_loop:
    movu       m0, [srcq+r6+16*7+8*0]
    movu       m1, [srcq+r6+16*7+8*1]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    pmulhrsw   m0, m3
    pmulhrsw   m1, m3
    packuswb   m0, m1
    mova       [dstq+r6+16*7], m0
    add        r6, 16
    jle .h_w128_loop
    add        srcq, ssq
    add        dstq, dsq
    dec        hd
    jg .h_w128
    RET
.v:
    movzx      wd, word [t0+wq*2+table_offset(put, _bilin_v)]
    imul       mxyd, 0x00ff00ff
    mova       m5, [base+pw_2048]
    add        mxyd, 0x00100010
    add        wq, t0
    movd       m4, mxyd
    pshufd     m4, m4, q0000
    movifnidn  dsq, dsmp
    jmp        wq
.v_w2:
    movd       m0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw     m0, [srcq+ssq*1], 1 ; 0 1
    lea        srcq, [srcq+ssq*2]
    pshuflw    m1, m0, q2301
    pinsrw     m0, [srcq+ssq*0], 0 ; 2 1
    punpcklbw  m1, m0
    pmaddubsw  m1, m4
    pmulhrsw   m1, m5
    packuswb   m1, m1
    movd       r6d, m1
    mov        [dstq+dsq*1], r6w
    shr        r6d, 16
    mov        [dstq+dsq*0], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd       m0, [srcq+ssq*0]
.v_w4_loop:
    movd       m2, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m1, m0
    movd       m0, [srcq+ssq*0]
    punpckldq  m1, m2 ; 0 1
    punpckldq  m2, m0 ; 1 2
    punpcklbw  m1, m2
    pmaddubsw  m1, m4
    pmulhrsw   m1, m5
    packuswb   m1, m1
    movd       [dstq+dsq*0], m1
    psrlq      m1, 32
    movd       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq       m0, [srcq+ssq*0]
.v_w8_loop:
    movq       m2, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m1, m0
    movq       m0, [srcq+ssq*0]
    punpcklbw  m1, m2
    punpcklbw  m2, m0
    pmaddubsw  m1, m4
    pmaddubsw  m2, m4
    pmulhrsw   m1, m5
    pmulhrsw   m2, m5
    packuswb   m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .v_w8_loop
    RET
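; Vertical blending reuses the same (16-my, my) byte pairs: punpcklbw
; interleaves row N with row N+1, so each pmaddubsw lane directly computes
; (16-my)*row[N][x] + my*row[N+1][x].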
%macro PUT_BILIN_V_W16 0
    movu       m0, [srcq+ssq*0]
%%loop:
    movu       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m1, m0
    mova       m2, m0
    movu       m0, [srcq+ssq*0]
    punpcklbw  m1, m3
    punpckhbw  m2, m3
    pmaddubsw  m1, m4
    pmaddubsw  m2, m4
    pmulhrsw   m1, m5
    pmulhrsw   m2, m5
    packuswb   m1, m2
    punpcklbw  m2, m3, m0
    punpckhbw  m3, m0
    pmaddubsw  m2, m4
    pmaddubsw  m3, m4
    pmulhrsw   m2, m5
    pmulhrsw   m3, m5
    packuswb   m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg %%loop
%endmacro
.v_w16:
    PUT_BILIN_V_W16
    RET
.v_w128:
    lea        r6d, [hq+(7<<16)]
    jmp .v_w16gt
.v_w64:
    lea        r6d, [hq+(3<<16)]
    jmp .v_w16gt
.v_w32:
    lea        r6d, [hq+(1<<16)]
.v_w16gt:
    mov        r4, srcq
%if ARCH_X86_64
    mov        r7, dstq
%endif
.v_w16gt_loop:
    PUT_BILIN_V_W16
%if ARCH_X86_64
    add        r4, 16
    add        r7, 16
    movzx      hd, r6b
    mov        srcq, r4
    mov        dstq, r7
%else
    mov        dstq, dstmp
    add        r4, 16
    movzx      hd, r6w
    add        dstq, 16
    mov        srcq, r4
    mov        dstmp, dstq
%endif
    sub        r6d, 1<<16
    jg .v_w16gt
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
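    ; Fixed-point detail: the horizontal weights get doubled (paddb m5, m5),
    ; so row intermediates are 2*H with H on the 16*src scale. With
    ; m6 = my << 11, pmulhw computes (2*dH * my*2048) >> 16 == (my*dH) >> 4,
    ; and pavgw against pw_15 gives (2*H + 16) >> 1 == H + 8; the final
    ; psrlw by 4 then realizes the second formula above exactly.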
    movzx      wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl        mxyd, 11 ; can't shift by 12 due to signed overflow
    mova       m7, [base+pw_15]
    movd       m6, mxyd
    add        wq, t0
    pshuflw    m6, m6, q0000
    paddb      m5, m5
    punpcklqdq m6, m6
    jmp        wq
.hv_w2:
    RESTORE_DSQ_32 t0
    movd       m0, [srcq+ssq*0]
    punpckldq  m0, m0
    pshufb     m0, m4
    pmaddubsw  m0, m5
.hv_w2_loop:
    movd       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movd       m2, [srcq+ssq*0]
    punpckldq  m1, m2
    pshufb     m1, m4
    pmaddubsw  m1, m5 ; 1 _ 2 _
    shufps     m2, m0, m1, q1032 ; 0 _ 1 _
    mova       m0, m1
    psubw      m1, m2 ; 2 * (src[x + src_stride] - src[x])
    pmulhw     m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
    pavgw      m2, m7 ; src[x] + 8
    paddw      m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
    psrlw      m1, 4
    packuswb   m1, m1
%if ARCH_X86_64
    movq       r6, m1
%else
    pshuflw    m1, m1, q2020
    movd       r6d, m1
%endif
    mov        [dstq+dsq*0], r6w
    shr        r6, gprsize*4
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova       m4, [base+bilin_h_shuf4]
    movddup    m0, [srcq+ssq*0]
    movifnidn  dsq, dsmp
    pshufb     m0, m4
    pmaddubsw  m0, m5
.hv_w4_loop:
    movq       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movhps     m1, [srcq+ssq*0]
    pshufb     m1, m4
    pmaddubsw  m1, m5 ; 1 2
    shufps     m2, m0, m1, q1032 ; 0 1
    mova       m0, m1
    psubw      m1, m2
    pmulhw     m1, m6
    pavgw      m2, m7
    paddw      m1, m2
    psrlw      m1, 4
    packuswb   m1, m1
    movd       [dstq+dsq*0], m1
    psrlq      m1, 32
    movd       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu       m0, [srcq+ssq*0]
    movifnidn  dsq, dsmp
    pshufb     m0, m4
    pmaddubsw  m0, m5
.hv_w8_loop:
    movu       m2, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m2, m4
    pmaddubsw  m2, m5
    psubw      m1, m2, m0
    pmulhw     m1, m6
    pavgw      m0, m7
    paddw      m1, m0
    movu       m0, [srcq+ssq*0]
    pshufb     m0, m4
    pmaddubsw  m0, m5
    psubw      m3, m0, m2
    pmulhw     m3, m6
    pavgw      m2, m7
    paddw      m3, m2
    psrlw      m1, 4
    psrlw      m3, 4
    packuswb   m1, m3
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea        r6d, [hq+(7<<16)]
    jmp .hv_w16_start
.hv_w64:
    lea        r6d, [hq+(3<<16)]
    jmp .hv_w16_start
.hv_w32:
    lea        r6d, [hq+(1<<16)]
.hv_w16_start:
    mov        r4, srcq
%if ARCH_X86_32
 %define m8 [dstq]
%else
    mov        r7, dstq
%endif
.hv_w16:
    movifnidn  dsq, dsmp
%if WIN64
    movaps     r4m, m8
%endif
.hv_w16_loop0:
    movu       m0, [srcq+8*0]
    movu       m1, [srcq+8*1]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
.hv_w16_loop:
    add        srcq, ssq
    movu       m2, [srcq+8*0]
    movu       m3, [srcq+8*1]
    pshufb     m2, m4
    pshufb     m3, m4
    pmaddubsw  m2, m5
    pmaddubsw  m3, m5
    mova       m8, m2
    psubw      m2, m0
    pmulhw     m2, m6
    pavgw      m0, m7
    paddw      m2, m0
    mova       m0, m3
    psubw      m3, m1
    pmulhw     m3, m6
    pavgw      m1, m7
    paddw      m3, m1
    mova       m1, m0
    mova       m0, m8
    psrlw      m2, 4
    psrlw      m3, 4
    packuswb   m2, m3
    mova       [dstq], m2
    add        dstq, dsmp
    dec        hd
    jg .hv_w16_loop
%if ARCH_X86_32
    mov        dstq, dstm
    add        r4, 16
    movzx      hd, r6w
    add        dstq, 16
    mov        srcq, r4
    mov        dstm, dstq
%else
    add        r4, 16
    add        r7, 16
    movzx      hd, r6b
    mov        srcq, r4
    mov        dstq, r7
%endif
    sub        r6d, 1<<16
    jg .hv_w16_loop0
%if WIN64
    movaps     m8, r4m
%endif
    RET

%if ARCH_X86_32
 %define base r6-prep%+SUFFIX
%else
 %define base 0
%endif

cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn  mxyd, r5m ; mx
    LEA        r6, prep_ssse3
    tzcnt      wd, wm
    movifnidn  hd, hm
    test       mxyd, mxyd
    jnz .h
    mov        mxyd, r6m ; my
    test       mxyd, mxyd
    jnz .v
.prep:
    movzx      wd, word [r6+wq*2+table_offset(prep,)]
    pxor       m4, m4
    add        wq, r6
    lea        stride3q, [strideq*3]
    jmp        wq
.prep_w4:
    movd       m0, [srcq+strideq*0]
    movd       m1, [srcq+strideq*1]
    movd       m2, [srcq+strideq*2]
    movd       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    punpckldq  m0, m1
    punpckldq  m2, m3
    punpcklbw  m0, m4
    punpcklbw  m2, m4
    psllw      m0, 4
    psllw      m2, 4
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m2
    add        tmpq, 16*2
    sub        hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq       m0, [srcq+strideq*0]
    movq       m1, [srcq+strideq*1]
    movq       m2, [srcq+strideq*2]
    movq       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    punpcklbw  m0, m4
    punpcklbw  m1, m4
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    psllw      m0, 4
    psllw      m1, 4
    psllw      m2, 4
    psllw      m3, 4
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu       m1, [srcq+strideq*0]
    movu       m3, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    punpcklbw  m0, m1, m4
    punpckhbw  m1, m4
    punpcklbw  m2, m3, m4
    punpckhbw  m3, m4
    psllw      m0, 4
    psllw      m1, 4
    psllw      m2, 4
    psllw      m3, 4
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 2
    jg .prep_w16
    RET
.prep_w128:
    mov        r3, -128
    jmp .prep_w32_start
.prep_w64:
    mov        r3, -64
    jmp .prep_w32_start
.prep_w32:
    mov        r3, -32
.prep_w32_start:
    sub        srcq, r3
.prep_w32_vloop:
    mov        r6, r3
.prep_w32_hloop:
    movu       m1, [srcq+r6+16*0]
    movu       m3, [srcq+r6+16*1]
    punpcklbw  m0, m1, m4
    punpckhbw  m1, m4
    punpcklbw  m2, m3, m4
    punpckhbw  m3, m4
    psllw      m0, 4
    psllw      m1, 4
    psllw      m2, 4
    psllw      m3, 4
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    add        r6, 32
    jl .prep_w32_hloop
    add        srcq, strideq
    dec        hd
    jg .prep_w32_vloop
    RET
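
; Note on intermediate precision: prep outputs are kept on a 16*px int16
; scale for the second (compound) pass: the copy path above shifts left by
; 4, and the filtered paths below store raw pmaddubsw sums of the
; (16-mx, mx) weights, which sit on the same scale.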
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
    imul       mxyd, 0x00ff00ff
    mova       m4, [base+subpel_h_shufD]
    add        mxyd, 0x00100010
    movd       m5, mxyd
    mov        mxyd, r6m ; my
    pshufd     m5, m5, q0000
    test       mxyd, mxyd
    jnz .hv
    movzx      wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add        wq, r6
    jmp        wq
.h_w4:
    mova       m4, [base+bilin_h_shuf4]
    lea        stride3q, [strideq*3]
.h_w4_loop:
    movq       m0, [srcq+strideq*0]
    movhps     m0, [srcq+strideq*1]
    movq       m1, [srcq+strideq*2]
    movhps     m1, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    mova       [tmpq+0 ], m0
    mova       [tmpq+16], m1
    add        tmpq, 32
    sub        hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    lea        stride3q, [strideq*3]
.h_w8_loop:
    movu       m0, [srcq+strideq*0]
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*2]
    movu       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    REPX       {pshufb x, m4}, m0, m1, m2, m3
    REPX       {pmaddubsw x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    movu       m0, [srcq+strideq*0+8*0]
    movu       m1, [srcq+strideq*0+8*1]
    movu       m2, [srcq+strideq*1+8*0]
    movu       m3, [srcq+strideq*1+8*1]
    lea        srcq, [srcq+strideq*2]
    REPX       {pshufb x, m4}, m0, m1, m2, m3
    REPX       {pmaddubsw x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 2
    jg .h_w16
    RET
.h_w128:
    mov        r3, -128
    jmp .h_w32_start
.h_w64:
    mov        r3, -64
    jmp .h_w32_start
.h_w32:
    mov        r3, -32
.h_w32_start:
    sub        srcq, r3
.h_w32_vloop:
    mov        r6, r3
.h_w32_hloop:
    movu       m0, [srcq+r6+8*0]
    movu       m1, [srcq+r6+8*1]
    movu       m2, [srcq+r6+8*2]
    movu       m3, [srcq+r6+8*3]
    REPX       {pshufb x, m4}, m0, m1, m2, m3
    REPX       {pmaddubsw x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    add        r6, 32
    jl .h_w32_hloop
    add        srcq, strideq
    dec        hd
    jg .h_w32_vloop
    RET
.v:
    movzx      wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    imul       mxyd, 0x00ff00ff
    add        mxyd, 0x00100010
    add        wq, r6
    lea        stride3q, [strideq*3]
    movd       m5, mxyd
    pshufd     m5, m5, q0000
    jmp        wq
.v_w4:
    movd       m0, [srcq+strideq*0]
.v_w4_loop:
    movd       m1, [srcq+strideq*1]
    movd       m2, [srcq+strideq*2]
    movd       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    punpckldq  m0, m1
    punpckldq  m1, m2
    punpcklbw  m0, m1 ; 01 12
    pmaddubsw  m0, m5
    mova       [tmpq+16*0], m0
    movd       m0, [srcq+strideq*0]
    punpckldq  m2, m3
    punpckldq  m3, m0
    punpcklbw  m2, m3 ; 23 34
    pmaddubsw  m2, m5
    mova       [tmpq+16*1], m2
    add        tmpq, 16*2
    sub        hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq       m0, [srcq+strideq*0]
.v_w8_loop:
    movq       m1, [srcq+strideq*1]
    movq       m2, [srcq+strideq*2]
    movq       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    punpcklbw  m0, m1 ; 01
    punpcklbw  m1, m2 ; 12
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5
    mova       [tmpq+16*0], m0
    movq       m0, [srcq+strideq*0]
    punpcklbw  m2, m3 ; 23
    punpcklbw  m3, m0 ; 34
    pmaddubsw  m2, m5
    mova       [tmpq+16*1], m1
    pmaddubsw  m3, m5
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu       m0, [srcq+strideq*0]
.v_w16_loop:
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*2]
    movu       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    punpcklbw  m4, m0, m1
    punpckhbw  m0, m1
    pmaddubsw  m4, m5
    pmaddubsw  m0, m5
    mova       [tmpq+16*0], m4
    punpcklbw  m4, m1, m2
    punpckhbw  m1, m2
    pmaddubsw  m4, m5
    mova       [tmpq+16*1], m0
    movu       m0, [srcq+strideq*0]
    pmaddubsw  m1, m5
    mova       [tmpq+16*2], m4
    punpcklbw  m4, m2, m3
    punpckhbw  m2, m3
    pmaddubsw  m4, m5
    mova       [tmpq+16*3], m1
    pmaddubsw  m2, m5
    mova       [tmpq+16*4], m4
    punpcklbw  m4, m3, m0
    punpckhbw  m3, m0
    pmaddubsw  m4, m5
    mova       [tmpq+16*5], m2
    pmaddubsw  m3, m5
    mova       [tmpq+16*6], m4
    mova       [tmpq+16*7], m3
    add        tmpq, 16*8
    sub        hd, 4
    jg .v_w16_loop
    RET
.v_w128:
    lea        r3d, [hq+(3<<8)]
    mov        r6d, 256
    jmp .v_w32_start
.v_w64:
    lea        r3d, [hq+(1<<8)]
    mov        r6d, 128
    jmp .v_w32_start
.v_w32:
    xor        r3d, r3d
    mov        r6d, 64
.v_w32_start:
%if ARCH_X86_64
 %if WIN64
    PUSH       r7
 %endif
    mov        r7, tmpq
%endif
    mov        r5, srcq
.v_w32_hloop:
    movu       m0, [srcq+strideq*0+16*0]
    movu       m1, [srcq+strideq*0+16*1]
.v_w32_vloop:
    movu       m2, [srcq+strideq*1+16*0]
    movu       m3, [srcq+strideq*1+16*1]
    lea        srcq, [srcq+strideq*2]
    punpcklbw  m4, m0, m2
    punpckhbw  m0, m2
    pmaddubsw  m4, m5
    pmaddubsw  m0, m5
    mova       [tmpq+16*0], m4
    mova       [tmpq+16*1], m0
    movu       m0, [srcq+strideq*0+16*0]
    punpcklbw  m4, m1, m3
    punpckhbw  m1, m3
    pmaddubsw  m4, m5
    pmaddubsw  m1, m5
    mova       [tmpq+16*2], m4
    mova       [tmpq+16*3], m1
    movu       m1, [srcq+strideq*0+16*1]
    add        tmpq, r6
    punpcklbw  m4, m2, m0
    punpckhbw  m2, m0
    pmaddubsw  m4, m5
    pmaddubsw  m2, m5
    mova       [tmpq+16*0], m4
    mova       [tmpq+16*1], m2
    punpcklbw  m4, m3, m1
    punpckhbw  m3, m1
    pmaddubsw  m4, m5
    pmaddubsw  m3, m5
    mova       [tmpq+16*2], m4
    mova       [tmpq+16*3], m3
    add        tmpq, r6
    sub        hd, 2
    jg .v_w32_vloop
    add        r5, 32
    movzx      hd, r3b
    mov        srcq, r5
%if ARCH_X86_64
    add        r7, 16*4
    mov        tmpq, r7
%else
    mov        tmpq, tmpmp
    add        tmpq, 16*4
    mov        tmpmp, tmpq
%endif
    sub        r3d, 1<<8
    jg .v_w32_hloop
%if WIN64
    POP        r7
%endif
    RET
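
; In the hv path below, my is packed as my*0x0800 per 16-bit lane, so
; pmulhrsw computes ((diff * my*2048) * 2 + 0x8000) >> 16 ==
; ((my * diff) + 8) >> 4, exactly the rounding term of the formula in its
; comments; no final shift is needed since prep keeps the 16*px scale.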
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
    movzx      wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    imul       mxyd, 0x08000800
    WIN64_SPILL_XMM 8
    movd       m6, mxyd
    add        wq, r6
    pshufd     m6, m6, q0000
    jmp        wq
.hv_w4:
    mova       m4, [base+bilin_h_shuf4]
    movddup    m0, [srcq+strideq*0]
    lea        r3, [strideq*3]
    pshufb     m0, m4
    pmaddubsw  m0, m5 ; _ 0
.hv_w4_loop:
    movq       m1, [srcq+strideq*1]
    movhps     m1, [srcq+strideq*2]
    movq       m2, [srcq+r3       ]
    lea        srcq, [srcq+strideq*4]
    movhps     m2, [srcq+strideq*0]
    pshufb     m1, m4
    pshufb     m2, m4
    pmaddubsw  m1, m5 ; 1 2
    pmaddubsw  m2, m5 ; 3 4
    shufpd     m0, m1, 0x01 ; 0 1
    shufpd     m3, m1, m2, 0x01 ; 2 3
    psubw      m1, m0
    pmulhrsw   m1, m6
    paddw      m1, m0
    mova       m0, m2
    psubw      m2, m3
    pmulhrsw   m2, m6
    paddw      m2, m3
    mova       [tmpq+16*0], m1
    mova       [tmpq+16*1], m2
    add        tmpq, 32
    sub        hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    movu       m0, [srcq+strideq*0]
    pshufb     m0, m4
    pmaddubsw  m0, m5 ; 0
.hv_w8_loop:
    movu       m1, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    movu       m2, [srcq+strideq*0]
    pshufb     m1, m4
    pshufb     m2, m4
    pmaddubsw  m1, m5 ; 1
    pmaddubsw  m2, m5 ; 2
    psubw      m3, m1, m0
    pmulhrsw   m3, m6
    paddw      m3, m0
    mova       m0, m2
    psubw      m2, m1
    pmulhrsw   m2, m6
    paddw      m2, m1
    mova       [tmpq+16*0], m3
    mova       [tmpq+16*1], m2
    add        tmpq, 16*2
    sub        hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea        r3d, [hq+(7<<8)]
    mov        r5d, 256
    jmp .hv_w16_start
.hv_w64:
    lea        r3d, [hq+(3<<8)]
    mov        r5d, 128
    jmp .hv_w16_start
.hv_w32:
    lea        r3d, [hq+(1<<8)]
    mov        r5d, 64
    jmp .hv_w16_start
.hv_w16:
    xor        r3d, r3d
    mov        r5d, 32
.hv_w16_start:
    mov        r6, srcq
%if ARCH_X86_64
 %if WIN64
    PUSH       r7
 %endif
    mov        r7, tmpq
%endif
.hv_w16_hloop:
    movu       m0, [srcq+strideq*0+8*0]
    movu       m1, [srcq+strideq*0+8*1]
    pshufb     m0, m4
    pshufb     m1, m4
    pmaddubsw  m0, m5 ; 0a
    pmaddubsw  m1, m5 ; 0b
.hv_w16_vloop:
    movu       m2, [srcq+strideq*1+8*0]
    pshufb     m2, m4
    pmaddubsw  m2, m5 ; 1a
    psubw      m3, m2, m0
    pmulhrsw   m3, m6
    paddw      m3, m0
    mova       [tmpq+16*0], m3
    movu       m3, [srcq+strideq*1+8*1]
    lea        srcq, [srcq+strideq*2]
    pshufb     m3, m4
    pmaddubsw  m3, m5 ; 1b
    psubw      m0, m3, m1
    pmulhrsw   m0, m6
    paddw      m0, m1
    mova       [tmpq+16*1], m0
    add        tmpq, r5
    movu       m0, [srcq+strideq*0+8*0]
    pshufb     m0, m4
    pmaddubsw  m0, m5 ; 2a
    psubw      m1, m0, m2
    pmulhrsw   m1, m6
    paddw      m1, m2
    mova       [tmpq+16*0], m1
    movu       m1, [srcq+strideq*0+8*1]
    pshufb     m1, m4
    pmaddubsw  m1, m5 ; 2b
    psubw      m2, m1, m3
    pmulhrsw   m2, m6
    paddw      m2, m3
    mova       [tmpq+16*1], m2
    add        tmpq, r5
    sub        hd, 2
    jg .hv_w16_vloop
    movzx      hd, r3b
%if ARCH_X86_64
    add        r6, 16
    add        r7, 2*16
    mov        srcq, r6
    mov        tmpq, r7
%else
    mov        tmpq, tmpm
    add        r6, 16
    add        tmpq, 2*16
    mov        srcq, r6
    mov        tmpm, tmpq
%endif
    sub        r3d, 1<<8
    jg .hv_w16_hloop
%if WIN64
    POP        r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
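
; These constants pack two filter-bank offsets: bits 16+ select the 8-tap
; bank (0 regular, 1 smooth, 2 sharp) and the low bits the 4-tap bank
; (3 regular, 4 smooth) within subpel_filters[5][15][8]. The FN stubs below
; load them into t0d/t1d, and adding mx*0x010101 overlays the 1..15 subpel
; index onto both banks plus a copy in bits 8-15, matching the
; "8tap_h, mx, 4tap_h" breakdown noted in the prologues.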

%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov        t0d, FILTER_%3
%ifidn %3, %4
    mov        t1d, t0d
%else
    mov        t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

%if ARCH_X86_32
 %define base_reg r1
 %define base base_reg-put_ssse3
%else
 %define base_reg r8
 %define base 0
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
    imul       mxd, mxm, 0x010101
    add        mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul       myd, mym, 0x010101
    add        myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul       ssd, mym, 0x010101
    add        ssd, t1d ; 8tap_v, my, 4tap_v
    mov        srcq, srcm
%endif
    mov        wd, wm
    movifnidn  hd, hm
    LEA        base_reg, put_ssse3
    test       mxd, 0xf00
    jnz .h
%if ARCH_X86_32
    test       ssd, 0xf00
%else
    test       myd, 0xf00
%endif
    jnz .v
.put:
    tzcnt      wd, wd
    movzx      wd, word [base_reg+wq*2+table_offset(put,)]
    movifnidn  ssq, ssmp
    add        wq, base_reg
    movifnidn  dsq, dsmp
%if WIN64
    pop        r8
%endif
    lea        r6, [ssq*3]
    jmp        wq
.h:
%if ARCH_X86_32
    test       ssd, 0xf00
%else
    test       myd, 0xf00
%endif
    jnz .hv
    movifnidn  ssq, ssmp
    mova       m5, [base+pw_34] ; 2 + (8 << 2)
    cmp        wd, 4
    jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4
    WIN64_SPILL_XMM 11
%if ARCH_X86_64
    mova       m8, [base+subpel_h_shufD]
    mova       m9, [base+subpel_h_shufE]
    mova       m10, [base+subpel_h_shufF]
%endif
    shr        mxd, 16
    sub        srcq, 2
    movq       m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
    punpcklwd  m7, m7
    pshufd     m4, m7, q0000
    pshufd     m6, m7, q1111
    pshufd     m7, m7, q2222
    sub        wd, 16
    jge .h_w16
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
%if ARCH_X86_32
    pshufb     %2, %1, [base+subpel_h_shufD]
    pshufb     %3, %1, [base+subpel_h_shufE]
    pshufb     %1, [base+subpel_h_shufF]
%else
    pshufb     %2, %1, m8
    pshufb     %3, %1, m9
    pshufb     %1, m10
%endif
    pmaddubsw  %2, m4
    pmaddubsw  %3, m6
    pmaddubsw  %1, m7
    paddw      %2, m5
    paddw      %2, %3
    paddw      %1, %2
    psraw      %1, 6
%endmacro
%if ARCH_X86_32
    mov        r4, dsm
%endif
.h_w8:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    PUT_6TAP_H m0, m2, m3
    PUT_6TAP_H m1, m2, m3
    packuswb   m0, m1
%if ARCH_X86_32
    movq       [dstq+r4*0], m0
    movhps     [dstq+r4*1], m0
    lea        dstq, [dstq+r4*2]
%else
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
%endif
    sub        hd, 2
    jg .h_w8
    RET
.h_w16:
    add        srcq, wq
    add        dstq, wq
    neg        wq
.h_w16_loop_v:
    mov        r6, wq
.h_w16_loop_h:
    movu       m0, [srcq+r6+8*0]
    movu       m1, [srcq+r6+8*1]
    PUT_6TAP_H m0, m2, m3
    PUT_6TAP_H m1, m2, m3
    packuswb   m0, m1
    mova       [dstq+r6], m0
    add        r6, 16
    jle .h_w16_loop_h
    add        srcq, ssq
    add        dstq, dsmp
    dec        hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
 %define dsq r4
 %define m8 [base+pw_512]
    movzx      mxd, ssb
    shr        ssd, 16
    cmp        hd, 6
    cmovs      ssd, mxd
    movq       m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    mov        ssq, ssm
    punpcklwd  m7, m7
    pshufd     m5, m7, q0000
    mov        r6, ssq
    pshufd     m6, m7, q1111
    neg        r6
    pshufd     m7, m7, q2222
    cmp        wd, 4
    jge .v_w4
%else
    WIN64_SPILL_XMM 9, 12
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m7, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    mova       m8, [base+pw_512]
    punpcklwd  m7, m7
    pshufd     m5, m7, q0000
    mov        nsq, ssq
    pshufd     m6, m7, q1111
    neg        nsq
    pshufd     m7, m7, q2222
    cmp        wd, 4
    je .v_w4
    jg .v_w8
%endif
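; Only taps 1..6 of the 8-byte filter entry are loaded above
; (subpel_filters+1): the regular and smooth filters have zero outer taps,
; which is what makes this 6-tap path valid for them. punpcklwd m7, m7
; duplicates the three coefficient byte pairs, and q0000/q1111/q2222
; broadcast them as (c1,c2), (c3,c4) and (c5,c6) for pmaddubsw.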
.v_w2:
%if ARCH_X86_32
    mov        dsq, dsm
    movd       m1, [srcq+r6 *2]
    movd       m3, [srcq+r6 *1]
%else
    movd       m1, [srcq+nsq*2]
    movd       m3, [srcq+nsq*1]
%endif
    movd       m2, [srcq+ssq*0]
    movd       m4, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movd       m0, [srcq+ssq*0]
    punpcklwd  m1, m3 ; 0 1
    punpcklwd  m3, m2 ; 1 2
    punpcklwd  m2, m4 ; 2 3
    punpcklwd  m4, m0 ; 3 4
    punpcklbw  m1, m3 ; 01 12
    punpcklbw  m2, m4 ; 23 34
.v_w2_loop:
    movd       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pmaddubsw  m4, m1, m5 ; a0 b0
    mova       m1, m2
    pmaddubsw  m2, m6 ; a1 b1
    paddw      m4, m2
    punpcklwd  m2, m0, m3 ; 4 5
    movd       m0, [srcq+ssq*0]
    punpcklwd  m3, m0 ; 5 6
    punpcklbw  m2, m3 ; 45 56
    pmaddubsw  m3, m2, m7 ; a2 b2
    paddw      m4, m3
    pmulhrsw   m4, m8
    packuswb   m4, m4
    movd       r6d, m4
    mov        [dstq+dsq*0], r6w
    shr        r6d, 16
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    shl        wd, 14
    lea        srcq, [srcq+r6*2]
    lea        r6d, [hq+wq-(1<<16)]
    mov        srcm, srcq
    mov        dsq, dsm
.v_w4_loop0:
    movd       m1, [srcq+ssq*0]
    movd       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
%else
    movd       m1, [srcq+nsq*2]
    movd       m3, [srcq+nsq*1]
%endif
    movd       m2, [srcq+ssq*0]
    movd       m4, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movd       m0, [srcq+ssq*0]
    punpckldq  m1, m3 ; 0 1
    punpckldq  m3, m2 ; 1 2
    punpckldq  m2, m4 ; 2 3
    punpckldq  m4, m0 ; 3 4
    punpcklbw  m1, m3 ; 01 12
    punpcklbw  m2, m4 ; 23 34
.v_w4_loop:
    movd       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pmaddubsw  m4, m1, m5 ; a0 b0
    mova       m1, m2
    pmaddubsw  m2, m6 ; a1 b1
    paddw      m4, m2
    punpckldq  m2, m0, m3 ; 4 5
    movd       m0, [srcq+ssq*0]
    punpckldq  m3, m0 ; 5 6
    punpcklbw  m2, m3 ; 45 56
    pmaddubsw  m3, m2, m7 ; a2 b2
    paddw      m4, m3
    pmulhrsw   m4, m8
    packuswb   m4, m4
    movd       [dstq+dsq*0], m4
    psrlq      m4, 32
    movd       [dstq+dsq*1], m4
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov        srcq, srcm
    mov        dstq, dstm
    movzx      hd, r6w
    add        srcq, 4
    add        dstq, 4
    mov        srcm, srcq
    mov        dstm, dstq
    sub        r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
    WIN64_PUSH_XMM 12
    shl        wd, 5
    lea        r6d, [hq+wq-256]
.v_w8_loop0:
    movq       m1, [srcq+nsq*2]
    movq       m2, [srcq+nsq*1]
    lea        r4, [srcq+ssq*2]
    movq       m3, [srcq+ssq*0]
    movq       m4, [srcq+ssq*1]
    mov        r7, dstq
    movq       m0, [r4 +ssq*0]
    punpcklbw  m1, m2 ; 01
    punpcklbw  m2, m3 ; 12
    punpcklbw  m3, m4 ; 23
    punpcklbw  m4, m0 ; 34
.v_w8_loop:
    pmaddubsw  m10, m1, m5 ; a0
    mova       m1, m3
    pmaddubsw  m11, m2, m5 ; b0
    mova       m2, m4
    pmaddubsw  m3, m6 ; a1
    pmaddubsw  m4, m6 ; b1
    paddw      m10, m3
    paddw      m11, m4
    movq       m4, [r4+ssq*1]
    lea        r4, [r4+ssq*2]
    punpcklbw  m3, m0, m4 ; 45
    movq       m0, [r4+ssq*0]
    punpcklbw  m4, m0 ; 56
    pmaddubsw  m9, m3, m7 ; a2
    paddw      m10, m9
    pmaddubsw  m9, m4, m7 ; b2
    paddw      m11, m9
    pmulhrsw   m10, m8
    pmulhrsw   m11, m8
    packuswb   m10, m11
    movq       [r7+dsq*0], m10
    movhps     [r7+dsq*1], m10
    lea        r7, [r7+dsq*2]
    sub        hd, 2
    jg .v_w8_loop
    add        srcq, 8
    add        dstq, 8
    movzx      hd, r6b
    sub        r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
.hv:
    RESET_STACK_STATE
    cmp        wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and        mxd, 0x7f
%else
    movzx      mxd, mxb
%endif
    dec        srcq
    movd       m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8]
%if ARCH_X86_32
    movzx      mxd, ssb
    shr        ssd, 16
    cmp        hd, 6
    cmovs      ssd, mxd
    movq       m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    mov        ssq, ssmp
    ALLOC_STACK -mmsize*4
 %define m8  [rsp+mmsize*0]
 %define m9  [rsp+mmsize*1]
 %define m10 [rsp+mmsize*2]
    punpcklbw  m0, m0
    sub        srcq, ssq
    psraw      m0, 8 ; sign-extend
    sub        srcq, ssq
    pshufd     m2, m0, q0000
    mova       m8, m2
    pshufd     m2, m0, q1111
    mova       m9, m2
    pshufd     m2, m0, q2222
    mova       m10, m2
%else
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m0, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 11, 14
    mov        nsq, ssq
    punpcklbw  m0, m0
    neg        nsq
    psraw      m0, 8 ; sign-extend
    pshufd     m8, m0, q0000
    pshufd     m9, m0, q1111
    pshufd     m10, m0, q2222
%endif
    cmp        wd, 4
    je .hv_w4
.hv_w2:
    mova       m5, [base+subpel_h_shuf4]
    mova       m6, [base+pw_34]
    pshufd     m7, m1, q0000
%if ARCH_X86_32
    movq       m2, [srcq+ssq*0]
    movhps     m2, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mov        dsq, [rstk+stack_offset+gprsize*2]
%else
    movq       m2, [srcq+nsq*2]
    movhps     m2, [srcq+nsq*1] ; 0 1
%endif
    movq       m1, [srcq+ssq*0]
    movhps     m1, [srcq+ssq*1] ; 2 3
    lea        srcq, [srcq+ssq*2]
    movq       m0, [srcq+ssq*0] ; 4
    REPX       {pshufb x, m5}, m2, m1, m0
    REPX       {pmaddubsw x, m7}, m2, m1, m0
    phaddw     m2, m1
    phaddw     m0, m0
    paddw      m2, m6
    paddw      m0, m6
    psraw      m2, 2 ; 0 1 2 3
    psraw      m0, 2
    palignr    m0, m2, 4 ; 1 2 3 4
    punpcklwd  m1, m2, m0 ; 01 12
    punpckhwd  m2, m0 ; 23 34
.hv_w2_loop:
    movq       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movhps     m3, [srcq+ssq*0] ; 5 6
    pshufb     m3, m5
    pmaddubsw  m3, m7
    pmaddwd    m4, m8, m1 ; a0 b0
    mova       m1, m2
    pmaddwd    m2, m9 ; a1 b1
    phaddw     m3, m3
    paddw      m3, m6
    psraw      m3, 2
    paddd      m4, m2
    palignr    m2, m3, m0, 12 ; 4 5
    mova       m0, m3
    punpcklwd  m2, m3 ; 45 56
    pmaddwd    m3, m10, m2 ; a2 b2
    paddd      m4, m3
    psrad      m4, 10
    packssdw   m4, m5
    packuswb   m4, m4
    movd       r6d, m4
    mov        [dstq+dsq*0], r6w
    shr        r6d, 16
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
%if ARCH_X86_32
    movq       m3, [srcq+ssq*0]
    movq       m4, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mov        dsq, [rstk+stack_offset+gprsize*2]
 %define m11 [base+pw_34]
 %define m12 [base+subpel_h_shufA]
 %define m13 [rsp+mmsize*3]
    pshufd     m1, m1, q0000
    mova       m13, m1
%else
    WIN64_PUSH_XMM 14
    movq       m3, [srcq+nsq*2]
    movq       m4, [srcq+nsq*1]
    pshufd     m13, m1, q0000
    mova       m12, [base+subpel_h_shufA]
    mova       m11, [base+pw_34]
%endif
    movq       m0, [srcq+ssq*0]
    movq       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movq       m2, [srcq+ssq*0]
%if ARCH_X86_32
    mova       m5, m12
    mova       m6, m13
    REPX       {pshufb x, m5 }, m3, m4, m0, m1, m2
    mova       m5, m11
    REPX       {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
%else
    REPX       {pshufb x, m12}, m3, m4, m0, m1, m2
    REPX       {pmaddubsw x, m13}, m3, m4, m0, m1, m2
%endif
    phaddw     m3, m0 ; 0 2
    phaddw     m4, m1 ; 1 3
    phaddw     m0, m2 ; 2 4
%if ARCH_X86_32
    REPX       {paddw x, m5 }, m3, m4, m0
%else
    REPX       {paddw x, m11}, m3, m4, m0
%endif
    REPX       {psraw x, 2 }, m3, m4, m0
    punpcklwd  m1, m3, m4 ; 01
    punpckhwd  m3, m4 ; 23
    punpcklwd  m2, m4, m0 ; 12
    punpckhwd  m4, m0 ; 34
.hv_w4_loop:
    movq       m7, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movq       m6, [srcq+ssq*0]
    pshufb     m7, m12
    pshufb     m6, m12
    pmaddubsw  m7, m13
    pmaddubsw  m6, m13
    pmaddwd    m5, m8, m1 ; a0
    mova       m1, m3
    phaddw     m7, m6 ; 5 6
    pmaddwd    m6, m8, m2 ; b0
    mova       m2, m4
    pmaddwd    m3, m9 ; a1
    pmaddwd    m4, m9 ; b1
    paddw      m7, m11
    psraw      m7, 2
    paddd      m5, m3
    paddd      m6, m4
    shufpd     m4, m0, m7, 0x01 ; 4 5
    mova       m0, m7
    punpcklwd  m3, m4, m7 ; 45
    punpckhwd  m4, m7 ; 56
    pmaddwd    m7, m10, m3 ; a2
    paddd      m5, m7
    pmaddwd    m7, m10, m4 ; b2
    paddd      m6, m7
    psrad      m5, 10
    psrad      m6, 10
    packssdw   m5, m6
    packuswb   m5, m5
    movd       [dstq+dsq*0], m5
    psrlq      m5, 32
    movd       [dstq+dsq*1], m5
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr        mxd, 16
    sub        srcq, 2
%if ARCH_X86_32
    movq       m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
    movzx      mxd, ssb
    shr        ssd, 16
    cmp        hd, 6
    cmovs      ssd, mxd
    movq       m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    shl        wd, 13
    mov        ssq, ssm
    lea        r6d, [hq+wq-(1<<16)]
%assign regs_used 5
    ALLOC_STACK -mmsize*16
%assign regs_used 7
    mov        dsq, [rstk+stack_offset+gprsize*2]
    sub        srcq, ssq
    sub        srcq, ssq
%if STACK_ALIGNMENT < 16
 %define srcm [esp+mmsize*15+gprsize*0]
 %define dstm [esp+mmsize*15+gprsize*1]
    mov        dstm, dstq
%endif
    mov        srcm, srcq
%else
    ALLOC_STACK 16*6, 16
    movq       m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovs      myd, mxd
    movq       m1, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    mov        nsq, ssq
    shl        wd, 13
    neg        nsq
    lea        r6d, [hq+wq-(1<<16)]
%endif
    mova       m7, [base+pw_34]
    punpcklwd  m0, m0
    punpcklbw  m1, m1
    psraw      m1, 8 ; sign-extend
    pshufd     m2, m0, q0000
    mova       [rsp+16*0], m2
    pshufd     m2, m0, q1111
    mova       [rsp+16*1], m2
    pshufd     m0, m0, q2222
    mova       [rsp+16*2], m0
    pshufd     m2, m1, q0000
    mova       [rsp+16*3], m2
    pshufd     m2, m1, q1111
    mova       [rsp+16*4], m2
    pshufd     m1, m1, q2222
    mova       [rsp+16*5], m1
%macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
                     [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
    pshufb     %2, %1, %4
    pshufb     %1, %5
    pmaddubsw  %3, %2, %6
    shufps     %2, %1, q2121
    pmaddubsw  %1, %8
    pmaddubsw  %2, %7
    paddw      %3, m7
    paddw      %1, %3
    paddw      %1, %2
    psraw      %1, 2
%endmacro
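; Note the shufps trick in HV_H_6TAP: with %2 holding the subpel_h_shufD
; bytes and %1 the subpel_h_shufF bytes, shufps q2121 picks dwords 1-2 of
; each, which is exactly the subpel_h_shufE arrangement, saving a third
; pshufb. The psraw by 2 (instead of 6) keeps extra precision for the
; vertical pmaddwd pass.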
.hv_w8_loop0:
    mova       m2, [base+subpel_h_shufD]
    mova       m3, [base+subpel_h_shufF]
    mova       m4, [rsp+16*0]
%if ARCH_X86_32
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    HV_H_6TAP  m0, m5, m6, m2, m3, m4
    HV_H_6TAP  m1, m5, m6, m2, m3, m4
    movu       m5, [srcq+ssq*0]
    punpcklwd  m6, m0, m1 ; 01
    punpckhwd  m0, m1
    mova       [rsp+16* 6], m6
    mova       [rsp+16* 7], m0
    HV_H_6TAP  m5, m0, m6, m2, m3, m4
    movu       m0, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpcklwd  m6, m1, m5 ; 12
    punpckhwd  m1, m5
    mova       [rsp+16* 8], m6
    mova       [rsp+16* 9], m1
    HV_H_6TAP  m0, m1, m6, m2, m3, m4
    movu       m1, [srcq+ssq*0]
    punpcklwd  m6, m5, m0 ; 23
    punpckhwd  m5, m0
    mova       [rsp+16*10], m6
    mova       [rsp+16*11], m5
    HV_H_6TAP  m1, m5, m6, m2, m3, m4
    mova       [rsp+16*14], m1
    punpcklwd  m6, m0, m1 ; 34
    punpckhwd  m0, m1
    mova       [rsp+16*12], m6
    mova       [rsp+16*13], m0
.hv_w8_loop:
    mova       m3, [rsp+16* 3]
    pmaddwd    m0, m3, [rsp+16* 6] ; a0
    pmaddwd    m2, m3, [rsp+16* 7] ; a0'
    pmaddwd    m1, m3, [rsp+16* 8] ; b0
    pmaddwd    m3, [rsp+16* 9] ; b0'
    mova       m6, [rsp+16* 4]
    mova       m4, [rsp+16*10]
    mova       m5, [rsp+16*11]
    mova       [rsp+16* 6], m4
    pmaddwd    m4, m6 ; a1
    mova       [rsp+16* 7], m5
    pmaddwd    m5, m6 ; a1'
    paddd      m0, m4
    mova       m4, [rsp+16*12]
    paddd      m2, m5
    mova       m5, [rsp+16*13]
    mova       [rsp+16* 8], m4
    pmaddwd    m4, m6 ; b1
    mova       [rsp+16* 9], m5
    pmaddwd    m5, m6 ; b1'
    movu       m6, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    paddd      m1, m4
    paddd      m3, m5
    HV_H_6TAP  m6, m4, m5
    mova       m5, [rsp+16*14]
    punpcklwd  m4, m5, m6 ; 45
    punpckhwd  m5, m6
    mova       [rsp+16*10], m4
    mova       [rsp+16*11], m5
    pmaddwd    m4, [rsp+16*5] ; a2
    pmaddwd    m5, [rsp+16*5] ; a2'
    paddd      m0, m4
    movu       m4, [srcq+ssq*0]
    paddd      m2, m5
    psrad      m0, 10
    psrad      m2, 10
    packssdw   m0, m2
    HV_H_6TAP  m4, m2, m5
    mova       m2, [rsp+16*5]
    punpcklwd  m5, m6, m4 ; 56
    mova       [rsp+16*14], m4
    punpckhwd  m6, m4
    mova       [rsp+16*12], m5
    pmaddwd    m5, m2 ; b2
    mova       [rsp+16*13], m6
    pmaddwd    m6, m2 ; b2'
    paddd      m1, m5
    paddd      m3, m6
    psrad      m1, 10
    psrad      m3, 10
    packssdw   m1, m3
    packuswb   m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .hv_w8_loop
    mov        srcq, srcm
    mov        dstq, dstm
    movzx      hd, r6w
    add        srcq, 8
    add        dstq, 8
    mov        srcm, srcq
    mov        dstm, dstq
%else
    movu       m9, [srcq+nsq*2]
    movu       m11, [srcq+nsq*1]
    lea        r4, [srcq+ssq*2]
    movu       m13, [srcq+ssq*0]
    movu       m15, [srcq+ssq*1]
    mov        r7, dstq
    movu       m6, [r4 +ssq*0]
    mova       m5, [rsp+16*1]
    mova       m8, [rsp+16*2]
    HV_H_6TAP  m9, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP  m11, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP  m13, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP  m15, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP  m6, m0, m1, m2, m3, m4, m5, m8
    punpcklwd  m8, m9, m11 ; 01
    punpckhwd  m9, m11
    punpcklwd  m10, m11, m13 ; 12
    punpckhwd  m11, m13
    punpcklwd  m12, m13, m15 ; 23
    punpckhwd  m13, m15
    punpcklwd  m14, m15, m6 ; 34
    punpckhwd  m15, m6
.hv_w8_loop:
    mova       m3, [rsp+16*3]
    mova       m4, [rsp+16*4]
    pmaddwd    m0, m8, m3 ; a0
    mova       m8, m12
    pmaddwd    m2, m9, m3 ; a0'
    mova       m9, m13
    pmaddwd    m1, m10, m3 ; b0
    mova       m10, m14
    pmaddwd    m3, m11 ; b0'
    mova       m11, m15
    REPX       {pmaddwd x, m4}, m12, m13, m14, m15
    paddd      m0, m12
    paddd      m2, m13
    paddd      m1, m14
    paddd      m3, m15
    movu       m15, [r4+ssq*1]
    lea        r4, [r4+ssq*2]
    HV_H_6TAP  m15, m4, m5
    punpcklwd  m12, m6, m15
    punpckhwd  m13, m6, m15
    movu       m6, [r4+ssq*0]
    HV_H_6TAP  m6, m4, m5
    mova       m4, [rsp+16*5]
    punpcklwd  m14, m15, m6
    punpckhwd  m15, m6
    pmaddwd    m5, m12, m4 ; a2
    paddd      m0, m5
    pmaddwd    m5, m13, m4 ; a2'
    paddd      m2, m5
    pmaddwd    m5, m14, m4 ; b2
    paddd      m1, m5
    pmaddwd    m4, m15 ; b2'
    paddd      m3, m4
    REPX       {psrad x, 10}, m0, m2, m1, m3
    packssdw   m0, m2
    packssdw   m1, m3
    packuswb   m0, m1
    movq       [r7+dsq*0], m0
    movhps     [r7+dsq*1], m0
    lea        r7, [r7+dsq*2]
    sub        hd, 2
    jg .hv_w8_loop
    add        srcq, 8
    add        dstq, 8
    movzx      hd, r6b
%endif
    sub        r6d, 1<<16
    jg .hv_w8_loop0
    RET

PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_8bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
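
; Filter combinations involving SHARP cannot take the 6-tap shortcut (the
; sharp filters have nonzero outer taps), so the stubs above route them to
; the full 8-tap implementation below.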

cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul       mxd, mxm, 0x010101
    add        mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul       myd, mym, 0x010101
    add        myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul       ssd, mym, 0x010101
    add        ssd, t1d ; 8tap_v, my, 4tap_v
    mov        srcq, srcm
%endif
    mov        wd, wm
    movifnidn  hd, hm
    LEA        base_reg, put_ssse3
    test       mxd, 0xf00
    jnz .h
%if ARCH_X86_32
    test       ssd, 0xf00
%else
    test       myd, 0xf00
%endif
    jnz .v
    tzcnt      wd, wd
    movzx      wd, word [base_reg+wq*2+table_offset(put,)]
    movifnidn  ssq, ssmp
    add        wq, base_reg
    movifnidn  dsq, dsmp
%if WIN64
    pop        r8
%endif
    lea        r6, [ssq*3]
    jmp        wq
.h_w2:
    mova       m3, [base+subpel_h_shuf4]
    movifnidn  dsq, dsmp
.h_w2_loop:
    movq       m0, [srcq+ssq*0]
    movhps     m0, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m0, m3
    pmaddubsw  m0, m4
    phaddw     m0, m0
    paddw      m0, m5 ; pw34
    psraw      m0, 6
    packuswb   m0, m0
    movd       r6d, m0
    mov        [dstq+dsq*0], r6w
    shr        r6d, 16
    mov        [dstq+dsq*1], r6w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .h_w2_loop
    RET
.h_w4:
%if ARCH_X86_32
    and        mxd, 0x7f
%else
    movzx      mxd, mxb
%endif
    movd       m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    dec        srcq
    pshufd     m4, m4, q0000
    cmp        wd, 4
    jl .h_w2
    mova       m3, [base+subpel_h_shufA]
    movifnidn  dsq, dsmp
.h_w4_loop:
    movq       m0, [srcq+ssq*0] ; 1
    movq       m1, [srcq+ssq*1] ; 2
    lea        srcq, [srcq+ssq*2]
    pshufb     m0, m3 ; subpel_h_shufA
    pshufb     m1, m3 ; subpel_h_shufA
    pmaddubsw  m0, m4 ; subpel_filters
    pmaddubsw  m1, m4 ; subpel_filters
    phaddw     m0, m1
    paddw      m0, m5 ; pw34
    psraw      m0, 6
    packuswb   m0, m0
    movd       [dstq+dsq*0], m0
    psrlq      m0, 32
    movd       [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg .h_w4_loop
    RET
.h:
%if ARCH_X86_32
    test       ssd, 0xf00
%else
    test       myd, 0xf00
%endif
    jnz .hv
    movifnidn  ssq, ssmp
    mova       m5, [base+pw_34] ; 2 + (8 << 2)
    cmp        wd, 4
    jle .h_w4
    WIN64_SPILL_XMM 12
%if ARCH_X86_64
    mova       m10, [base+subpel_h_shufA]
    mova       m11, [base+subpel_h_shufB]
    mova       m9, [base+subpel_h_shufC]
%endif
    shr        mxd, 16
    sub        srcq, 3
    movq       m7, [base_reg+mxq*8+subpel_filters-put_ssse3]
    pshufd     m6, m7, q0000
    pshufd     m7, m7, q1111
    sub        wd, 16
    jge .h_w16
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
 %if ARCH_X86_32
    pshufb     %2, %1, [base+subpel_h_shufB]
    pshufb     %3, %1, [base+subpel_h_shufC]
    pshufb     %1, [base+subpel_h_shufA]
 %else
    pshufb     %2, %1, m11 ; subpel_h_shufB
    pshufb     %3, %1, m9  ; subpel_h_shufC
    pshufb     %1, m10     ; subpel_h_shufA
 %endif
    pmaddubsw  %4, %2, m6 ; subpel +0 B0
    pmaddubsw  %2, m7     ; subpel +4 B4
    pmaddubsw  %3, m7     ; C4
    pmaddubsw  %1, m6     ; A0
    paddw      %3, %4     ; C4+B0
    paddw      %1, %2     ; A0+B4
    phaddw     %1, %3
    paddw      %1, m5     ; pw34
    psraw      %1, 6
%endmacro
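
; PUT_8TAP_H above splits each 8-tap sum into 4-tap halves: shufA bytes
; with taps 0-3 plus shufB bytes with taps 4-7 cover pixels 0-3 ("A0+B4"),
; while shufB with taps 0-3 plus shufC with taps 4-7 cover pixels 4-7
; ("C4+B0"); phaddw then folds the paired partial sums into eight results.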
wq 2364.h_w16_loop_v: 2365 mov r6, wq 2366.h_w16_loop_h: 2367 movu m0, [srcq+r6+8*0] 2368 movu m1, [srcq+r6+8*1] 2369 PUT_8TAP_H m0, m2, m3, m4 2370 PUT_8TAP_H m1, m2, m3, m4 2371 packuswb m0, m1 2372 mova [dstq+r6], m0 2373 add r6, 16 2374 jle .h_w16_loop_h 2375 add srcq, ssq 2376 add dstq, dsmp 2377 dec hd 2378 jg .h_w16_loop_v 2379 RET 2380.v: 2381%if ARCH_X86_32 2382 movzx mxd, ssb 2383 shr ssd, 16 2384 cmp hd, 6 2385 cmovs ssd, mxd 2386 movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] 2387%else 2388 WIN64_SPILL_XMM 16 2389 movzx mxd, myb 2390 shr myd, 16 2391 cmp hd, 6 2392 cmovs myd, mxd 2393 movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] 2394%endif 2395 punpcklwd m0, m0 2396 mova m7, [base+pw_512] 2397%if ARCH_X86_32 2398 %define subpel0 [rsp+mmsize*0] 2399 %define subpel1 [rsp+mmsize*1] 2400 %define subpel2 [rsp+mmsize*2] 2401 %define subpel3 [rsp+mmsize*3] 2402%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed 2403 ALLOC_STACK -16*4 2404%assign regs_used 7 2405 pshufd m1, m0, q0000 2406 mova subpel0, m1 2407 pshufd m1, m0, q1111 2408 mova subpel1, m1 2409 pshufd m1, m0, q2222 2410 mova subpel2, m1 2411 pshufd m1, m0, q3333 2412 mova subpel3, m1 2413 mov ssq, [rstk+stack_offset+gprsize*4] 2414 lea ssq, [ssq*3] 2415 sub srcq, ssq 2416 mov ssq, [rstk+stack_offset+gprsize*4] 2417 mov dsq, [rstk+stack_offset+gprsize*2] 2418 cmp wd, 2 2419 jne .v_w4 2420%else 2421 %define subpel0 m8 2422 %define subpel1 m9 2423 %define subpel2 m10 2424 %define subpel3 m11 2425 lea ss3q, [ssq*3] 2426 pshufd m8, m0, q0000 2427 sub srcq, ss3q 2428 pshufd m9, m0, q1111 2429 pshufd m10, m0, q2222 2430 pshufd m11, m0, q3333 2431 cmp wd, 4 2432 je .v_w4 2433 jg .v_w8 2434%endif 2435.v_w2: 2436 movd m1, [srcq+ssq*0] 2437 movd m0, [srcq+ssq*1] 2438%if ARCH_X86_32 2439 lea srcq, [srcq+ssq*2] 2440 movd m2, [srcq+ssq*0] 2441 movd m5, [srcq+ssq*1] 2442 lea srcq, [srcq+ssq*2] 2443 movd m3, [srcq+ssq*0] 2444 movd m4, [srcq+ssq*1] 2445 lea srcq, [srcq+ssq*2] 2446%else 2447 movd m2, [srcq+ssq*2] 2448 add srcq, ss3q 2449 movd m5, [srcq+ssq*0] 2450 movd m3, [srcq+ssq*1] 2451 movd m4, [srcq+ssq*2] 2452 add srcq, ss3q 2453%endif 2454 punpcklwd m1, m0 ; 0 1 2455 punpcklwd m0, m2 ; 1 2 2456 punpcklbw m1, m0 ; 01 12 2457 movd m0, [srcq+ssq*0] 2458 punpcklwd m2, m5 ; 2 3 2459 punpcklwd m5, m3 ; 3 4 2460 punpcklwd m3, m4 ; 4 5 2461 punpcklwd m4, m0 ; 5 6 2462 punpcklbw m2, m5 ; 23 34 2463 punpcklbw m3, m4 ; 45 56 2464.v_w2_loop: 2465 movd m4, [srcq+ssq*1] 2466 lea srcq, [srcq+ssq*2] 2467 pmaddubsw m5, m1, subpel0 ; a0 b0 2468 mova m1, m2 2469 pmaddubsw m2, subpel1 ; a1 b1 2470 paddw m5, m2 2471 mova m2, m3 2472 pmaddubsw m3, subpel2 ; a2 b2 2473 paddw m5, m3 2474 punpcklwd m3, m0, m4 ; 6 7 2475 movd m0, [srcq+ssq*0] 2476 punpcklwd m4, m0 ; 7 8 2477 punpcklbw m3, m4 ; 67 78 2478 pmaddubsw m4, m3, subpel3 ; a3 b3 2479 paddw m5, m4 2480 pmulhrsw m5, m7 2481 packuswb m5, m5 2482 movd r6d, m5 2483 mov [dstq+dsq*0], r6w 2484 shr r6d, 16 2485 mov [dstq+dsq*1], r6w 2486 lea dstq, [dstq+dsq*2] 2487 sub hd, 2 2488 jg .v_w2_loop 2489 RET 2490.v_w4: 2491%if ARCH_X86_32 2492 shl wd, 14 2493%if STACK_ALIGNMENT < 16 2494 %define dstm [rsp+mmsize*4+gprsize] 2495 mov dstm, dstq 2496%endif 2497 lea r6d, [hq+wq-(1<<16)] 2498 mov r4, srcq 2499.v_w4_loop0: 2500%endif 2501 movd m1, [srcq+ssq*0] 2502 movd m0, [srcq+ssq*1] 2503%if ARCH_X86_32 2504 lea srcq, [srcq+ssq*2] 2505 movd m2, [srcq+ssq*0] 2506 movd m5, [srcq+ssq*1] 2507 lea srcq, [srcq+ssq*2] 2508 movd m3, [srcq+ssq*0] 2509 movd m4, [srcq+ssq*1] 2510 lea srcq, 
[srcq+ssq*2] 2511%else 2512 movd m2, [srcq+ssq*2] 2513 add srcq, ss3q 2514 movd m5, [srcq+ssq*0] 2515 movd m3, [srcq+ssq*1] 2516 movd m4, [srcq+ssq*2] 2517 add srcq, ss3q 2518%endif 2519 punpckldq m1, m0 ; 0 1 2520 punpckldq m0, m2 ; 1 2 2521 punpcklbw m1, m0 ; 01 12 2522 movd m0, [srcq+ssq*0] 2523 punpckldq m2, m5 ; 2 3 2524 punpckldq m5, m3 ; 3 4 2525 punpckldq m3, m4 ; 4 5 2526 punpckldq m4, m0 ; 5 6 2527 punpcklbw m2, m5 ; 23 34 2528 punpcklbw m3, m4 ; 45 56 2529.v_w4_loop: 2530 movd m4, [srcq+ssq*1] 2531 lea srcq, [srcq+ssq*2] 2532 pmaddubsw m5, m1, subpel0 ; a0 b0 2533 mova m1, m2 2534 pmaddubsw m2, subpel1 ; a1 b1 2535 paddw m5, m2 2536 mova m2, m3 2537 pmaddubsw m3, subpel2 ; a2 b2 2538 paddw m5, m3 2539 punpckldq m3, m0, m4 ; 6 7 _ _ 2540 movd m0, [srcq+ssq*0] 2541 punpckldq m4, m0 ; 7 8 _ _ 2542 punpcklbw m3, m4 ; 67 78 2543 pmaddubsw m4, m3, subpel3 ; a3 b3 2544 paddw m5, m4 2545 pmulhrsw m5, m7 2546 packuswb m5, m5 2547 movd [dstq+dsq*0], m5 2548 psrlq m5, 32 2549 movd [dstq+dsq*1], m5 2550 lea dstq, [dstq+dsq*2] 2551 sub hd, 2 2552 jg .v_w4_loop 2553%if ARCH_X86_32 2554 mov dstq, dstm 2555 add r4, 4 2556 movzx hd, r6w 2557 add dstq, 4 2558 mov srcq, r4 2559 mov dstm, dstq 2560 sub r6d, 1<<16 2561 jg .v_w4_loop0 2562%endif 2563 RET 2564%if ARCH_X86_64 2565.v_w8: 2566 shl wd, 5 2567 lea r6d, [hq+wq-256] 2568.v_w8_loop0: 2569 movq m1, [srcq+ssq*0] 2570 movq m2, [srcq+ssq*1] 2571 lea r4, [srcq+ss3q] 2572 movq m3, [srcq+ssq*2] 2573 movq m4, [r4 +ssq*0] 2574 mov r7, dstq 2575 movq m5, [r4 +ssq*1] 2576 movq m6, [r4 +ssq*2] 2577 add r4, ss3q 2578 movq m0, [r4 +ssq*0] 2579 punpcklbw m1, m2 ; 01 2580 punpcklbw m2, m3 ; 12 2581 punpcklbw m3, m4 ; 23 2582 punpcklbw m4, m5 ; 34 2583 punpcklbw m5, m6 ; 45 2584 punpcklbw m6, m0 ; 56 2585.v_w8_loop: 2586 movq m13, [r4+ssq*1] 2587 lea r4, [r4+ssq*2] 2588 pmaddubsw m14, m1, subpel0 ; a0 2589 mova m1, m3 2590 pmaddubsw m15, m2, subpel0 ; b0 2591 mova m2, m4 2592 pmaddubsw m3, subpel1 ; a1 2593 mova m12, m0 2594 pmaddubsw m4, subpel1 ; b1 2595 movq m0, [r4+ssq*0] 2596 paddw m14, m3 2597 paddw m15, m4 2598 mova m3, m5 2599 pmaddubsw m5, subpel2 ; a2 2600 mova m4, m6 2601 pmaddubsw m6, subpel2 ; b2 2602 punpcklbw m12, m13 ; 67 2603 punpcklbw m13, m0 ; 78 2604 paddw m14, m5 2605 mova m5, m12 2606 pmaddubsw m12, subpel3 ; a3 2607 paddw m15, m6 2608 mova m6, m13 2609 pmaddubsw m13, subpel3 ; b3 2610 paddw m14, m12 2611 paddw m15, m13 2612 pmulhrsw m14, m7 2613 pmulhrsw m15, m7 2614 packuswb m14, m15 2615 movq [r7+dsq*0], m14 2616 movhps [r7+dsq*1], m14 2617 lea r7, [r7+dsq*2] 2618 sub hd, 2 2619 jg .v_w8_loop 2620 add srcq, 8 2621 add dstq, 8 2622 movzx hd, r6b 2623 sub r6d, 1<<8 2624 jg .v_w8_loop0 2625 RET 2626%endif ;ARCH_X86_64 2627%undef subpel0 2628%undef subpel1 2629%undef subpel2 2630%undef subpel3 2631.hv: 2632 RESET_STACK_STATE 2633 cmp wd, 4 2634 jg .hv_w8 2635%if ARCH_X86_32 2636 and mxd, 0x7f 2637%else 2638 movzx mxd, mxb 2639%endif 2640 dec srcq 2641 movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] 2642%if ARCH_X86_32 2643 movzx mxd, ssb 2644 shr ssd, 16 2645 cmp hd, 6 2646 cmovs ssd, mxd 2647 movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] 2648 mov ssq, ssmp 2649 lea r6, [ssq*3] 2650 sub srcq, r6 2651 %define base_reg r6 2652 mov r6, r1; use as new base 2653 %assign regs_used 2 2654 ALLOC_STACK -mmsize*14 2655 %assign regs_used 7 2656 mov dsq, [rstk+stack_offset+gprsize*2] 2657 %define subpelv0 [rsp+mmsize*0] 2658 %define subpelv1 [rsp+mmsize*1] 2659 %define subpelv2 [rsp+mmsize*2] 2660 %define subpelv3 [rsp+mmsize*3] 2661 
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m6, m0, q0000
    mova subpelv0, m6
    pshufd m6, m0, q1111
    mova subpelv1, m6
    pshufd m6, m0, q2222
    mova subpelv2, m6
    pshufd m6, m0, q3333
    mova subpelv3, m6
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea ss3q, [ssq*3]
    sub srcq, ss3q
 %define subpelv0 m10
 %define subpelv1 m11
 %define subpelv2 m12
 %define subpelv3 m13
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    mova m8, [base+pw_8192]
    mova m9, [base+pd_512]
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
%endif
    pshufd m7, m1, q0000
    cmp wd, 4
    je .hv_w4
.hv_w2:
    mova m6, [base+subpel_h_shuf4]
    movq m2, [srcq+ssq*0] ; 0
    movhps m2, [srcq+ssq*1] ; 0 _ 1
%if ARCH_X86_32
 %define w8192reg [base+pw_8192]
 %define d512reg [base+pd_512]
    lea srcq, [srcq+ssq*2]
    movq m0, [srcq+ssq*0] ; 2
    movhps m0, [srcq+ssq*1] ; 2 _ 3
    lea srcq, [srcq+ssq*2]
%else
 %define w8192reg m8
 %define d512reg m9
    movq m0, [srcq+ssq*2] ; 2
    add srcq, ss3q
    movhps m0, [srcq+ssq*0] ; 2 _ 3
%endif
    pshufb m2, m6 ; 0 ~ 1 ~
    pshufb m0, m6 ; 2 ~ 3 ~
    pmaddubsw m2, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m2, m0 ; 0 1 2 3
    pmulhrsw m2, w8192reg
%if ARCH_X86_32
    movq m3, [srcq+ssq*0] ; 4
    movhps m3, [srcq+ssq*1] ; 4 _ 5
    lea srcq, [srcq+ssq*2]
%else
    movq m3, [srcq+ssq*1] ; 4
    movhps m3, [srcq+ssq*2] ; 4 _ 5
    add srcq, ss3q
%endif
    movq m0, [srcq+ssq*0] ; 6
    pshufb m3, m6 ; 4 ~ 5 ~
    pshufb m0, m6 ; 6 ~
    pmaddubsw m3, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m3, m0 ; 4 5 6 _
    pmulhrsw m3, w8192reg
    palignr m4, m3, m2, 4 ; V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12   0 1 1 2
    punpckhwd m2, m4 ; V 23 34   2 3 3 4
    pshufd m0, m3, q2121 ; V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56   4 5 5 6
.hv_w2_loop:
    movq m4, [srcq+ssq*1] ; V 7
    lea srcq, [srcq+ssq*2] ; V
    movhps m4, [srcq+ssq*0] ; V 7 8
    pshufb m4, m6
    pmaddubsw m4, m7
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2 ; V
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2 ; V
    mova m2, m3 ; V
    pmaddwd m3, subpelv2 ; a2 b2
    phaddw m4, m4
    pmulhrsw m4, w8192reg
    paddd m5, m3 ; V
    palignr m3, m4, m0, 12
    mova m0, m4
    punpcklwd m3, m0 ; V 67 78
    pmaddwd m4, m3, subpelv3 ; V a3 b3
    paddd m5, d512reg
    paddd m5, m4
    psrad m5, 10
    packssdw m5, m5
    packuswb m5, m5
    movd r4d, m5
    mov [dstq+dsq*0], r4w
    shr r4d, 16
    mov [dstq+dsq*1], r4w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%macro SAVELINE_W4 3
    mova [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
%macro RESTORELINE_W4 3
    mova %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
%if ARCH_X86_32
 %define w8192reg [base+pw_8192]
 %define d512reg [base+pd_512]
%else
 %define w8192reg m8
 %define d512reg m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 0 _ _ _
    movhps m5, [srcq+ssq*1] ; 0 _ 1 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 2 _ _ _
    movhps m4, [srcq+ssq*1] ; 2 _ 3 _
    lea srcq, [srcq+ssq*2]
%else
    movq m4, [srcq+ssq*2] ; 2 _ _ _
    movhps m4, [srcq+ss3q ] ; 2 _ 3 _
    lea srcq, [srcq+ssq*4]
%endif
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    SAVELINE_W4 m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    ; lower shuffle
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 4 _ _ _
    movhps m5, [srcq+ssq*1] ; 4 _ 5 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 6 _ _ _
    add srcq, ssq
%else
    movq m4, [srcq+ssq*2] ; 6 _ _ _
    add srcq, ss3q
%endif
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    SAVELINE_W4 m3, 3, 0
    ; upper shuffle
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    ; process high
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    ; process low
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
.hv_w4_loop:
    ; process low
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4]
    movq m4, [srcq+ssq*0] ; 7
    movhps m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw m4, m7 ;H subpel_filters
    phaddw m4, m4 ;H 7 8 7 8
    pmulhrsw m4, w8192reg ;H pw_8192
    palignr m3, m4, m0, 12 ; 6 7 8 7
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d512reg ; pd_512
    paddd m5, m4
    psrad m5, 10
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
    SAVELINE_W4 m2, 2, 0
    SAVELINE_W4 m3, 3, 0
    SAVELINE_W4 m5, 5, 0
    ; process high
    RESTORELINE_W4 m0, 0, 1
    RESTORELINE_W4 m1, 1, 1
    RESTORELINE_W4 m2, 2, 1
    RESTORELINE_W4 m3, 3, 1
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
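    ; (high half: same 4-tap vertical chain as the low half above, fed by
    ; the line pairs restored from the hv4_line_*_1 stack slots)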
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4+16]
    movq m4, [srcq+ssq*0] ; 7
    movhps m4, [srcq+ssq*1] ; 7 _ 8 _
    lea srcq, [srcq+ssq*2]
    pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw m4, m7 ;H subpel_filters
    phaddw m4, m4 ;H 7 8 7 8
    pmulhrsw m4, w8192reg ;H pw_8192
    palignr m3, m4, m0, 12 ; 6 7 8 7
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d512reg ; pd_512
    paddd m5, m4
    psrad m4, m5, 10
    RESTORELINE_W4 m5, 5, 0
    packssdw m5, m4 ; d -> w
    packuswb m5, m5 ; w -> b
    pshuflw m5, m5, q3120
    movd [dstq+dsq*0], m5
    psrlq m5, 32
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    RESTORELINE_W4 m0, 0, 0
    RESTORELINE_W4 m1, 1, 0
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
%macro SAVELINE_W8 2
    mova [rsp+hv8_line_%1*mmsize], %2
%endmacro
%macro RESTORELINE_W8 2
    mova %2, [rsp+hv8_line_%1*mmsize]
%endmacro
    shr mxd, 16
    sub srcq, 3
%if ARCH_X86_32
 %define base_reg r1
 %define subpelh0 [rsp+mmsize*5]
 %define subpelh1 [rsp+mmsize*6]
 %define subpelv0 [rsp+mmsize*7]
 %define subpelv1 [rsp+mmsize*8]
 %define subpelv2 [rsp+mmsize*9]
 %define subpelv3 [rsp+mmsize*10]
 %define accuv0 [rsp+mmsize*11]
 %define accuv1 [rsp+mmsize*12]
    movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    ALLOC_STACK -mmsize*13
 %if STACK_ALIGNMENT < 16
  %define dstm [rsp+mmsize*13+gprsize*1]
  %define dsm [rsp+mmsize*13+gprsize*2]
    mov r6, [rstk+stack_offset+gprsize*2]
    mov dsm, r6
 %endif
    pshufd m0, m1, q0000
    pshufd m1, m1, q1111
    punpcklbw m5, m5
    psraw m5, 8 ; sign-extend
    pshufd m2, m5, q0000
    pshufd m3, m5, q1111
    pshufd m4, m5, q2222
    pshufd m5, m5, q3333
    mova subpelh0, m0
    mova subpelh1, m1
    mova subpelv0, m2
    mova subpelv1, m3
    mova subpelv2, m4
    mova subpelv3, m5
    lea r6, [ssq*3]
    mov dstm, dstq
    sub srcq, r6
%else
    ALLOC_STACK 16*5, 16
 %define subpelh0 m10
 %define subpelh1 m11
 %define subpelv0 m12
 %define subpelv1 m13
 %define subpelv2 m14
 %define subpelv3 m15
 %define accuv0 m8
 %define accuv1 m9
    movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111
    punpcklbw m1, m1
    psraw m1, 8 ; sign-extend
    pshufd subpelv0, m1, q0000
    pshufd subpelv1, m1, q1111
    pshufd subpelv2, m1, q2222
    pshufd subpelv3, m1, q3333
    lea ss3q, [ssq*3]
    mov r7, dstq
    sub srcq, ss3q
%endif
    shl wd, 14
    lea r6d, [hq+wq-(1<<16)]
    mov r4, srcq
.hv_w8_loop0:
    movu m4, [srcq+ssq*0] ; 0 = _ _
    movu m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
[srcq+ssq*2] 3044%endif 3045%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] 3046 %if ARCH_X86_32 3047 pshufb %3, %1, [base+subpel_h_shufB] 3048 pshufb %4, %1, [base+subpel_h_shufC] 3049 pshufb %1, [base+subpel_h_shufA] 3050 %else 3051 pshufb %3, %1, %6 ; subpel_h_shufB 3052 pshufb %4, %1, %7 ; subpel_h_shufC 3053 pshufb %1, %5 ; subpel_h_shufA 3054 %endif 3055 pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 3056 pmaddubsw %4, subpelh1; subpel +4 B4 3057 pmaddubsw %3, subpelh1; C4 3058 pmaddubsw %1, subpelh0; A0 3059 paddw %2, %4 ; C0+B4 3060 paddw %1, %3 ; A0+C4 3061 phaddw %1, %2 3062%endmacro 3063%if ARCH_X86_64 3064 mova m7, [base+subpel_h_shufA] 3065 mova m8, [base+subpel_h_shufB] 3066 mova m9, [base+subpel_h_shufC] 3067%endif 3068 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ 3069 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ 3070%if ARCH_X86_32 3071 movu m6, [srcq+ssq*0] ; 2 = _ _ 3072 movu m0, [srcq+ssq*1] ; 3 = _ _ 3073 lea srcq, [srcq+ssq*2] 3074%else 3075 movu m6, [srcq+ssq*2] ; 2 = _ _ 3076 add srcq, ss3q 3077 movu m0, [srcq+ssq*0] ; 3 = _ _ 3078%endif 3079 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ 3080 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ 3081 mova m7, [base+pw_8192] 3082 pmulhrsw m4, m7 ; H pw_8192 3083 pmulhrsw m5, m7 ; H pw_8192 3084 pmulhrsw m6, m7 ; H pw_8192 3085 pmulhrsw m0, m7 ; H pw_8192 3086 punpcklwd m1, m4, m5 ; 0 1 ~ 3087 punpcklwd m2, m5, m6 ; 1 2 ~ 3088 punpcklwd m3, m6, m0 ; 2 3 ~ 3089 SAVELINE_W8 1, m1 3090 SAVELINE_W8 2, m2 3091 SAVELINE_W8 3, m3 3092 mova m7, [base+subpel_h_shufA] 3093%if ARCH_X86_32 3094 movu m4, [srcq+ssq*0] ; 4 = _ _ 3095 movu m5, [srcq+ssq*1] ; 5 = _ _ 3096 lea srcq, [srcq+ssq*2] 3097%else 3098 movu m4, [srcq+ssq*1] ; 4 = _ _ 3099 movu m5, [srcq+ssq*2] ; 5 = _ _ 3100 add srcq, ss3q 3101%endif 3102 movu m6, [srcq+ssq*0] ; 6 = _ _ 3103 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ 3104 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ 3105 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ 3106 mova m7, [base+pw_8192] 3107 pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ 3108 pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ 3109 pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ 3110 punpcklwd m4, m0, m1 ; 3 4 ~ 3111 punpcklwd m5, m1, m2 ; 4 5 ~ 3112 punpcklwd m6, m2, m3 ; 5 6 ~ 3113 SAVELINE_W8 6, m3 3114 RESTORELINE_W8 1, m1 3115 RESTORELINE_W8 2, m2 3116 RESTORELINE_W8 3, m3 3117.hv_w8_loop: 3118 ; m8 accu for V a 3119 ; m9 accu for V b 3120 SAVELINE_W8 1, m3 3121 SAVELINE_W8 2, m4 3122 SAVELINE_W8 3, m5 3123 SAVELINE_W8 4, m6 3124%if ARCH_X86_32 3125 pmaddwd m0, m1, subpelv0 ; a0 3126 pmaddwd m7, m2, subpelv0 ; b0 3127 pmaddwd m3, subpelv1 ; a1 3128 pmaddwd m4, subpelv1 ; b1 3129 paddd m0, m3 3130 paddd m7, m4 3131 pmaddwd m5, subpelv2 ; a2 3132 pmaddwd m6, subpelv2 ; b2 3133 paddd m0, m5 3134 paddd m7, m6 3135 mova m5, [base+pd_512] 3136 paddd m0, m5 ; pd_512 3137 paddd m7, m5 ; pd_512 3138 mova accuv0, m0 3139 mova accuv1, m7 3140%else 3141 pmaddwd m8, m1, subpelv0 ; a0 3142 pmaddwd m9, m2, subpelv0 ; b0 3143 pmaddwd m3, subpelv1 ; a1 3144 pmaddwd m4, subpelv1 ; b1 3145 paddd m8, m3 3146 paddd m9, m4 3147 pmaddwd m5, subpelv2 ; a2 3148 pmaddwd m6, subpelv2 ; b2 3149 paddd m8, m5 3150 paddd m9, m6 3151 mova m7, [base+pd_512] 3152 paddd m8, m7 ; pd_512 3153 paddd m9, m7 ; pd_512 3154 mova m7, [base+subpel_h_shufB] 3155 mova m6, [base+subpel_h_shufC] 3156 mova m5, [base+subpel_h_shufA] 3157%endif 3158 movu m0, [srcq+ssq*1] ; 7 3159 movu m4, [srcq+ssq*2] ; 8 3160 lea srcq, [srcq+ssq*2] 3161 HV_H_W8 m0, m1, m2, m3, m5, m7, m6 3162 HV_H_W8 m4, m1, m2, m3, m5, m7, m6 3163 mova m5, 
[base+pw_8192] 3164 pmulhrsw m0, m5 ; H pw_8192 3165 pmulhrsw m4, m5 ; H pw_8192 3166 RESTORELINE_W8 6, m6 3167 punpcklwd m5, m6, m0 ; 6 7 ~ 3168 punpcklwd m6, m0, m4 ; 7 8 ~ 3169 pmaddwd m1, m5, subpelv3 ; a3 3170 paddd m2, m1, accuv0 3171 pmaddwd m1, m6, subpelv3 ; b3 3172 paddd m1, m1, accuv1 ; H + V 3173 psrad m2, 10 3174 psrad m1, 10 3175 packssdw m2, m1 ; d -> w 3176 packuswb m2, m1 ; w -> b 3177 movd [dstq+dsq*0], m2 3178 psrlq m2, 32 3179%if ARCH_X86_32 3180 add dstq, dsm 3181 movd [dstq+dsq*0], m2 3182 add dstq, dsm 3183%else 3184 movd [dstq+dsq*1], m2 3185 lea dstq, [dstq+dsq*2] 3186%endif 3187 sub hd, 2 3188 jle .hv_w8_outer 3189 SAVELINE_W8 6, m4 3190 RESTORELINE_W8 1, m1 3191 RESTORELINE_W8 2, m2 3192 RESTORELINE_W8 3, m3 3193 RESTORELINE_W8 4, m4 3194 jmp .hv_w8_loop 3195.hv_w8_outer: 3196%if ARCH_X86_32 3197 mov dstq, dstm 3198 add r4, 4 3199 movzx hd, r6w 3200 add dstq, 4 3201 mov srcq, r4 3202 mov dstm, dstq 3203%else 3204 add r4, 4 3205 add r7, 4 3206 movzx hd, r6b 3207 mov srcq, r4 3208 mov dstq, r7 3209%endif 3210 sub r6d, 1<<16 3211 jg .hv_w8_loop0 3212 RET 3213 3214%if ARCH_X86_32 3215 DECLARE_REG_TMP 1, 2 3216%elif WIN64 3217 DECLARE_REG_TMP 6, 4 3218%else 3219 DECLARE_REG_TMP 6, 7 3220%endif 3221 3222%if ARCH_X86_32 3223 %define base_reg r2 3224 %define base base_reg-prep_ssse3 3225%else 3226 %define base_reg r7 3227 %define base 0 3228%endif 3229 3230%define PREP_8TAP_FN FN prep_8tap, 3231PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc 3232PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc 3233PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc 3234PREP_8TAP_FN regular, REGULAR, REGULAR 3235 3236cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns 3237 imul mxd, mxm, 0x010101 3238 add mxd, t0d ; 8tap_h, mx, 4tap_h 3239 imul myd, mym, 0x010101 3240 add myd, t1d ; 8tap_v, my, 4tap_v 3241 mov wd, wm 3242 movifnidn srcd, srcm 3243 movifnidn hd, hm 3244 LEA base_reg, prep_ssse3 3245 test mxd, 0xf00 3246 jnz .h 3247 test myd, 0xf00 3248 jnz .v 3249.prep: 3250 tzcnt wd, wd 3251 movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] 3252 pxor m4, m4 3253 add wq, base_reg 3254 movifnidn ssq, ssmp 3255 lea r6, [ssq*3] 3256%if WIN64 3257 pop r8 3258 pop r7 3259%endif 3260 jmp wq 3261.h: 3262 test myd, 0xf00 3263 jnz .hv 3264 test myd, 0xf00 3265 jnz .hv 3266%if ARCH_X86_32 3267 %define ssq r6 3268 mov ssq, ssmp 3269%endif 3270 cmp wd, 4 3271 jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4 3272 WIN64_SPILL_XMM 11 3273 mova m5, [base+pw_8192] 3274%if ARCH_X86_64 3275 mova m8, [base+subpel_h_shufD] 3276 mova m9, [base+subpel_h_shufE] 3277 mova m10, [base+subpel_h_shufF] 3278%endif 3279 shr mxd, 16 3280 sub srcq, 2 3281 movq m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8] 3282 punpcklwd m7, m7 3283 pshufd m4, m7, q0000 3284 pshufd m6, m7, q1111 3285 pshufd m7, m7, q2222 3286 sub wd, 16 3287 jge .h_w16 3288%macro PREP_6TAP_H 3 ; dst/src, tmp[1-2] 3289%if ARCH_X86_32 3290 pshufb %2, %1, [base+subpel_h_shufD] 3291 pshufb %3, %1, [base+subpel_h_shufE] 3292 pshufb %1, [base+subpel_h_shufF] 3293%else 3294 pshufb %2, %1, m8 3295 pshufb %3, %1, m9 3296 pshufb %1, m10 3297%endif 3298 pmaddubsw %2, m4 3299 pmaddubsw %3, m6 3300 pmaddubsw %1, m7 3301 paddw %2, %3 3302 paddw %1, %2 3303 pmulhrsw %1, m5 3304%endmacro 3305.h_w8: 3306 movu m0, [srcq+ssq*0] 3307 movu m1, [srcq+ssq*1] 3308 lea srcq, [srcq+ssq*2] 3309 PREP_6TAP_H m0, m2, m3 3310 PREP_6TAP_H m1, m2, m3 3311 mova [tmpq+16*0], m0 3312 mova [tmpq+16*1], m1 3313 add tmpq, 32 3314 sub 
    jg .h_w8
    RET
.h_w16:
    add srcq, wq
    neg wq
.h_w16_loop_v:
    mov r5, wq
.h_w16_loop_h:
    movu m0, [srcq+r5+8*0]
    movu m1, [srcq+r5+8*1]
    PREP_6TAP_H m0, m2, m3
    PREP_6TAP_H m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    add tmpq, 32
    add r5, 16
    jle .h_w16_loop_h
    add srcq, ssq
    dec hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0x7f
%else
    WIN64_SPILL_XMM 9, 12
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
    punpcklwd m7, m7
    pshufd m5, m7, q0000
    pshufd m6, m7, q1111
    pshufd m7, m7, q2222
%if ARCH_X86_32
 %define m8 [base+pw_8192]
    mov ssq, ssm
    sub srcq, ssq
    sub srcq, ssq
%else
    mova m8, [base+pw_8192]
    mov nsq, ssq
    neg nsq
    cmp wd, 4
    jg .v_w8
%endif
.v_w4:
%if ARCH_X86_32
    lea r5d, [wq-4]
    shl r5d, 14
    add r5d, hd
    mov srcm, srcq
.v_w4_loop0:
    movd m1, [srcq+ssq*0]
    movd m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
%else
    movd m1, [srcq+nsq*2]
    movd m3, [srcq+nsq*1]
%endif
    movd m2, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m0, [srcq+ssq*0]
    punpckldq m1, m3 ; 0 1
    punpckldq m3, m2 ; 1 2
    punpckldq m2, m4 ; 2 3
    punpckldq m4, m0 ; 3 4
    punpcklbw m1, m3 ; 01 12
    punpcklbw m2, m4 ; 23 34
.v_w4_loop:
    movd m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m4, m1, m5 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m6 ; a1 b1
    paddw m4, m2
    punpckldq m2, m0, m3 ; 4 5
    movd m0, [srcq+ssq*0]
    punpckldq m3, m0 ; 5 6
    punpcklbw m2, m3 ; 45 56
    pmaddubsw m3, m2, m7 ; a2 b2
    paddw m4, m3
    pmulhrsw m4, m8
%if ARCH_X86_32
    movq [tmpq+wq*0], m4
    movhps [tmpq+wq*2], m4
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jg .v_w4_loop
    mov srcq, srcm
    mov tmpq, tmpm
    movzx hd, r5w
    add srcq, 4
    add tmpq, 8
    mov srcm, srcq
    mov tmpm, tmpq
    sub r5d, 1<<16
    jg .v_w4_loop0
%else
    mova [tmpq], m4
    add tmpq, 16
    sub hd, 2
    jg .v_w4_loop
%endif
    RET
%if ARCH_X86_64
.v_w8:
    WIN64_PUSH_XMM 12
    lea r6d, [wq*4-32]
    lea r6d, [r6*8+hq]
.v_w8_loop0:
    movq m1, [srcq+nsq*2]
    movq m2, [srcq+nsq*1]
    lea r5, [srcq+ssq*2]
    movq m3, [srcq+ssq*0]
    movq m4, [srcq+ssq*1]
    mov r8, tmpq
    movq m0, [r5 +ssq*0]
    punpcklbw m1, m2 ; 01
    punpcklbw m2, m3 ; 12
    punpcklbw m3, m4 ; 23
    punpcklbw m4, m0 ; 34
.v_w8_loop:
    pmaddubsw m10, m1, m5 ; a0
    mova m1, m3
    pmaddubsw m11, m2, m5 ; b0
    mova m2, m4
    pmaddubsw m3, m6 ; a1
    pmaddubsw m4, m6 ; b1
    paddw m10, m3
    paddw m11, m4
    movq m4, [r5+ssq*1]
    lea r5, [r5+ssq*2]
    punpcklbw m3, m0, m4 ; 45
    movq m0, [r5+ssq*0]
    punpcklbw m4, m0 ; 56
    pmaddubsw m9, m3, m7 ; a2
    paddw m10, m9
    pmaddubsw m9, m4, m7 ; b2
    paddw m11, m9
    pmulhrsw m10, m8
    pmulhrsw m11, m8
    mova [r8+wq*0], m10
    mova [r8+wq*2], m11
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 8
    add tmpq, 16
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
.hv:
    RESET_STACK_STATE
    cmp wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8]
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0x7f
%else
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
%if ARCH_X86_32
    mov ssq, ssmp
 %assign regs_used 6
    ALLOC_STACK -mmsize*4
 %assign regs_used 7
 %define m8 [rsp+mmsize*0]
 %define m9 [rsp+mmsize*1]
 %define m10 [rsp+mmsize*2]
    punpcklbw m0, m0
    sub srcq, ssq
    psraw m0, 8 ; sign-extend
    sub srcq, ssq
    pshufd m2, m0, q0000
    mova m8, m2
    pshufd m2, m0, q1111
    mova m9, m2
    pshufd m2, m0, q2222
    mova m10, m2
    movq m3, [srcq+ssq*0]
    movq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
 %define m11 [base+pw_8192]
 %define m12 [base+subpel_h_shufA]
 %define m13 [rsp+mmsize*3]
 %define m14 [base+pd_32]
    pshufd m1, m1, q0000
    mova m13, m1
%else
    WIN64_SPILL_XMM 15
    mov nsq, ssq
    punpcklbw m0, m0
    neg nsq
    psraw m0, 8 ; sign-extend
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    movq m3, [srcq+nsq*2]
    movq m4, [srcq+nsq*1]
    pshufd m13, m1, q0000
    mova m12, [base+subpel_h_shufA]
    mova m11, [base+pw_8192]
    mova m14, [base+pd_32]
%endif
    movq m0, [srcq+ssq*0]
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movq m2, [srcq+ssq*0]
%if ARCH_X86_32
    mova m5, m12
    mova m6, m13
    REPX {pshufb x, m5 }, m3, m4, m0, m1, m2
    mova m5, m11
    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
%else
    REPX {pshufb x, m12}, m3, m4, m0, m1, m2
    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
%endif
    phaddw m3, m0 ; 0 2
    phaddw m4, m1 ; 1 3
    phaddw m0, m2 ; 2 4
%if ARCH_X86_32
    REPX {pmulhrsw x, m5 }, m3, m4, m0
%else
    REPX {pmulhrsw x, m11}, m3, m4, m0
%endif
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w4_loop:
    movq m7, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movq m6, [srcq+ssq*0]
    pshufb m7, m12
    pshufb m6, m12
    pmaddubsw m7, m13
    pmaddubsw m6, m13
    pmaddwd m5, m8, m1 ; a0
    mova m1, m3
    phaddw m7, m6 ; 5 6
    pmaddwd m6, m8, m2 ; b0
    mova m2, m4
    pmaddwd m3, m9 ; a1
    pmaddwd m4, m9 ; b1
    pmulhrsw m7, m11
    paddd m5, m14
    paddd m6, m14
    paddd m5, m3
    paddd m6, m4
    shufpd m4, m0, m7, 0x01 ; 4 5
    mova m0, m7
    punpcklwd m3, m4, m7 ; 45
    punpckhwd m4, m7 ; 56
    pmaddwd m7, m10, m3 ; a2
    paddd m5, m7
    pmaddwd m7, m10, m4 ; b2
    paddd m6, m7
    psrad m5, 6
    psrad m6, 6
    packssdw m5, m6
    mova [tmpq], m5
    add tmpq, 16
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr mxd, 16
    sub srcq, 2
    movq m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0x7f
%else
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
%if ARCH_X86_32
    mov ssq, ssm
 %assign regs_used 6
    ALLOC_STACK -mmsize*16
 %assign regs_used 7
    sub srcq, ssq
    sub srcq, ssq
 %if STACK_ALIGNMENT < 16
  %define srcm [esp+mmsize*15+gprsize*0]
  %define tmpm [esp+mmsize*15+gprsize*1]
    mov tmpm, tmpq
 %endif
    mov srcm, srcq
%else
    ALLOC_STACK 16*6, 16
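    ; note: [rsp+16*0..2] will hold the three horizontal tap pairs (bytes,
    ; for pmaddubsw) and [rsp+16*3..5] the three vertical tap pairs
    ; (sign-extended words, for pmaddwd); see the pshufd/mova stores below.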
    mov nsq, ssq
    neg nsq
%endif
    mova m7, [base+pw_8192]
    lea r5d, [wq-8]
    punpcklwd m0, m0
    shl r5d, 13
    punpcklbw m1, m1
    add r5d, hd
    psraw m1, 8 ; sign-extend
    pshufd m2, m0, q0000
    mova [rsp+16*0], m2
    pshufd m2, m0, q1111
    mova [rsp+16*1], m2
    pshufd m0, m0, q2222
    mova [rsp+16*2], m0
    pshufd m2, m1, q0000
    mova [rsp+16*3], m2
    pshufd m2, m1, q1111
    mova [rsp+16*4], m2
    pshufd m1, m1, q2222
    mova [rsp+16*5], m1
%macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
                          [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
    pshufb %2, %1, %4
    pshufb %1, %5
    pmaddubsw %3, %2, %6
    shufps %2, %1, q2121
    pmaddubsw %1, %8
    pmaddubsw %2, %7
    paddw %1, %3
    paddw %1, %2
    pmulhrsw %1, m7
%endmacro
.hv_w8_loop0:
    mova m2, [base+subpel_h_shufD]
    mova m3, [base+subpel_h_shufF]
    mova m4, [rsp+16*0]
%if ARCH_X86_32
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    PREP_HV_H_6TAP m0, m5, m6, m2, m3, m4
    PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4
    movu m5, [srcq+ssq*0]
    punpcklwd m6, m0, m1 ; 01
    punpckhwd m0, m1
    mova [rsp+16* 6], m6
    mova [rsp+16* 7], m0
    PREP_HV_H_6TAP m5, m0, m6, m2, m3, m4
    movu m0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklwd m6, m1, m5 ; 12
    punpckhwd m1, m5
    mova [rsp+16* 8], m6
    mova [rsp+16* 9], m1
    PREP_HV_H_6TAP m0, m1, m6, m2, m3, m4
    movu m1, [srcq+ssq*0]
    punpcklwd m6, m5, m0 ; 23
    punpckhwd m5, m0
    mova [rsp+16*10], m6
    mova [rsp+16*11], m5
    PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4
    mova [rsp+16*14], m1
    punpcklwd m6, m0, m1 ; 34
    punpckhwd m0, m1
    mova [rsp+16*12], m6
    mova [rsp+16*13], m0
.hv_w8_loop:
    mova m3, [rsp+16* 3]
    pmaddwd m0, m3, [rsp+16* 6] ; a0
    pmaddwd m2, m3, [rsp+16* 7] ; a0'
    pmaddwd m1, m3, [rsp+16* 8] ; b0
    pmaddwd m3, [rsp+16* 9] ; b0'
    mova m6, [rsp+16* 4]
    mova m4, [rsp+16*10]
    mova m5, [rsp+16*11]
    mova [rsp+16* 6], m4
    pmaddwd m4, m6 ; a1
    mova [rsp+16* 7], m5
    pmaddwd m5, m6 ; a1'
    paddd m0, m4
    mova m4, [rsp+16*12]
    paddd m2, m5
    mova m5, [rsp+16*13]
    mova [rsp+16* 8], m4
    pmaddwd m4, m6 ; b1
    mova [rsp+16* 9], m5
    pmaddwd m5, m6 ; b1'
    movu m6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    paddd m1, m4
    paddd m3, m5
    PREP_HV_H_6TAP m6, m4, m5
    mova m4, [base+pd_32]
    mova m5, [rsp+16*14]
    REPX {paddd x, m4}, m0, m2, m1, m3
    punpcklwd m4, m5, m6 ; 45
    punpckhwd m5, m6
    mova [rsp+16*10], m4
    mova [rsp+16*11], m5
    pmaddwd m4, [rsp+16*5] ; a2
    pmaddwd m5, [rsp+16*5] ; a2'
    paddd m0, m4
    movu m4, [srcq+ssq*0]
    paddd m2, m5
    psrad m0, 6
    psrad m2, 6
    packssdw m0, m2
    PREP_HV_H_6TAP m4, m2, m5
    mova m2, [rsp+16*5]
    punpcklwd m5, m6, m4 ; 56
    mova [rsp+16*14], m4
    punpckhwd m6, m4
    mova [rsp+16*12], m5
    pmaddwd m5, m2 ; b2
    mova [rsp+16*13], m6
    pmaddwd m6, m2 ; b2'
    paddd m1, m5
    paddd m3, m6
    psrad m1, 6
    psrad m3, 6
    packssdw m1, m3
    mova [tmpq+wq*0], m0
    mova [tmpq+wq*2], m1
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jg .hv_w8_loop
    mov srcq, srcm
    mov tmpq, tmpm
    movzx hd, r5w
    add srcq, 8
    add tmpq, 16
    mov srcm, srcq
    mov tmpm, tmpq
%else
    movu m9, [srcq+nsq*2]
    movu m11, [srcq+nsq*1]
    lea r6, [srcq+ssq*2]
    movu m13, [srcq+ssq*0]
    movu m15, [srcq+ssq*1]
    mov r8, tmpq
    movu m6, [r6 +ssq*0]
    mova m5, [rsp+16*1]
    mova m8, [rsp+16*2]
    PREP_HV_H_6TAP m9, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP m11, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP m13, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP m15, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP m6, m0, m1, m2, m3, m4, m5, m8
    punpcklwd m8, m9, m11 ; 01
    punpckhwd m9, m11
    punpcklwd m10, m11, m13 ; 12
    punpckhwd m11, m13
    punpcklwd m12, m13, m15 ; 23
    punpckhwd m13, m15
    punpcklwd m14, m15, m6 ; 34
    punpckhwd m15, m6
.hv_w8_loop:
    mova m3, [rsp+16*3]
    mova m4, [rsp+16*4]
    mova m5, [base+pd_32]
    pmaddwd m0, m8, m3 ; a0
    mova m8, m12
    pmaddwd m2, m9, m3 ; a0'
    mova m9, m13
    pmaddwd m1, m10, m3 ; b0
    mova m10, m14
    pmaddwd m3, m11 ; b0'
    mova m11, m15
    REPX {pmaddwd x, m4}, m12, m13, m14, m15
    REPX {paddd x, m5}, m0, m2, m1, m3
    paddd m0, m12
    paddd m2, m13
    paddd m1, m14
    paddd m3, m15
    movu m15, [r6+ssq*1]
    lea r6, [r6+ssq*2]
    PREP_HV_H_6TAP m15, m4, m5
    punpcklwd m12, m6, m15 ; 45
    punpckhwd m13, m6, m15
    movu m6, [r6+ssq*0]
    PREP_HV_H_6TAP m6, m4, m5
    mova m4, [rsp+16*5]
    punpcklwd m14, m15, m6 ; 56
    punpckhwd m15, m6
    pmaddwd m5, m12, m4 ; a2
    paddd m0, m5
    pmaddwd m5, m13, m4 ; a2'
    paddd m2, m5
    pmaddwd m5, m14, m4 ; b2
    paddd m1, m5
    pmaddwd m4, m15 ; b2'
    paddd m3, m4
    REPX {psrad x, 6}, m0, m2, m1, m3
    packssdw m0, m2
    packssdw m1, m3
    mova [r8+wq*0], m0
    mova [r8+wq*2], m1
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 8
    add tmpq, 16
    movzx hd, r5b
%endif
    sub r5d, 1<<16
    jg .hv_w8_loop0
    RET

PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc
PREP_8TAP_FN sharp, SHARP, SHARP

cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    mov wd, wm
    movifnidn srcd, srcm
    movifnidn hd, hm
    LEA base_reg, prep_ssse3
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep
.v:
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0x7f
%else
    WIN64_SPILL_XMM 16
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mova m2, [base+pw_512]
    mova m7, [base+pw_8192]
    punpcklwd m0, m0
%if ARCH_X86_32
 %define subpel0 [rsp+mmsize*0]
 %define subpel1 [rsp+mmsize*1]
 %define subpel2 [rsp+mmsize*2]
 %define subpel3 [rsp+mmsize*3]
 %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
    ALLOC_STACK -mmsize*4
 %assign regs_used 7
    mov strideq, [rstk+stack_offset+gprsize*3]
    pshufd m1, m0, q0000
    mova subpel0, m1
    pshufd m1, m0, q1111
    mova subpel1, m1
    lea r5, [strideq*3]
    pshufd m1, m0, q2222
    mova subpel2, m1
    pshufd m1, m0, q3333
    mova subpel3, m1
    sub srcq, r5
%else
 %define subpel0 m8
 %define subpel1 m9
 %define subpel2 m10
 %define subpel3 m11
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    lea stride3q, [strideq*3]
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    sub srcq, stride3q
    cmp wd, 8
    jns .v_w8
%endif
.v_w4:
%if ARCH_X86_32
 %if STACK_ALIGNMENT < mmsize
  %define srcm [esp+stack_size+gprsize*1]
  %define tmpm [esp+stack_size+gprsize*2]
 %endif
    mov tmpm, tmpq
    mov srcm, srcq
    lea r5d, [wq-4] ; horizontal loop
    shl r5d, (16-2) ; (wq/4) << 16
    mov r5w, hw
.v_w4_loop0:
%endif
    movd m1, [srcq+strideq*0]
    movd m0, [srcq+strideq*1]
%if ARCH_X86_32
    lea srcq, [srcq+strideq*2]
    movd m2, [srcq+strideq*0]
    movd m4, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    movd m3, [srcq+strideq*0]
    movd m5, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
%else
    movd m2, [srcq+strideq*2]
    add srcq, stride3q
    movd m4, [srcq+strideq*0]
    movd m3, [srcq+strideq*1]
    movd m5, [srcq+strideq*2]
    add srcq, stride3q
%endif
    punpckldq m1, m0 ; 0 1
    punpckldq m0, m2 ; 1 2
    punpcklbw m1, m0 ; 01 12
    movd m0, [srcq+strideq*0]
    punpckldq m2, m4 ; 2 3
    punpckldq m4, m3 ; 3 4
    punpckldq m3, m5 ; 4 5
    punpckldq m5, m0 ; 5 6
    punpcklbw m2, m4 ; 23 34
    punpcklbw m3, m5 ; 45 56
.v_w4_loop:
    mova m5, m1
    pmaddubsw m5, subpel0 ; a0 b0
    mova m1, m2
    pmaddubsw m2, subpel1 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, subpel2 ; a2 b2
    movd m4, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    paddw m5, m3
    punpckldq m3, m0, m4 ; 6 7 _ _
    movd m0, [srcq+strideq*0]
    punpckldq m4, m0 ; 7 8 _ _
    punpcklbw m3, m4 ; 67 78
    mova m4, m3
    pmaddubsw m4, subpel3 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    movq [tmpq+wq*0], m5
    movhps [tmpq+wq*2], m5
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov srcq, srcm
    mov tmpq, tmpm
    movzx hd, r5w
    add srcq, 4
    add tmpq, 8
    mov srcm, srcq
    mov tmpm, tmpq
    sub r5d, 1<<16 ; horizontal--
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
    lea r6d, [wq*8-64]
    mov r5, srcq
    mov r8, tmpq
    lea r6d, [hq+r6*4]
.v_w8_loop0:
    movq m1, [srcq+strideq*0]
    movq m2, [srcq+strideq*1]
    movq m3, [srcq+strideq*2]
    add srcq, stride3q
    movq m4, [srcq+strideq*0]
    movq m5, [srcq+strideq*1]
    movq m6, [srcq+strideq*2]
    add srcq, stride3q
    movq m0, [srcq+strideq*0]
    punpcklbw m1, m2 ; 01
    punpcklbw m2, m3 ; 12
    punpcklbw m3, m4 ; 23
    punpcklbw m4, m5 ; 34
    punpcklbw m5, m6 ; 45
    punpcklbw m6, m0 ; 56
.v_w8_loop:
    movq m13, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    pmaddubsw m14, m1, subpel0 ; a0
    pmaddubsw m15, m2, subpel0 ; b0
    mova m1, m3
    mova m2, m4
    pmaddubsw m3, subpel1 ; a1
    pmaddubsw m4, subpel1 ; b1
    paddw m14, m3
    paddw m15, m4
    mova m3, m5
    mova m4, m6
    pmaddubsw m5, subpel2 ; a2
    pmaddubsw m6, subpel2 ; b2
    punpcklbw m12, m0, m13 ; 67
    movq m0, [srcq+strideq*0]
    punpcklbw m13, m0 ; 78
    paddw m14, m5
    mova m5, m12
    pmaddubsw m12, subpel3 ; a3
    paddw m15, m6
    mova m6, m13
    pmaddubsw m13, subpel3 ; b3
    paddw m14, m12
    paddw m15, m13
    pmulhrsw m14, m7
    pmulhrsw m15, m7
    movu [tmpq+wq*0], m14
    movu [tmpq+wq*2], m15
    lea tmpq, [tmpq+wq*4]
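    ; two unclipped 16-bit intermediate rows are emitted per iteration;
    ; one tmp row is wq*2 bytes, so tmpq advances by wq*4.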
    sub hd, 2
    jg .v_w8_loop
    add r5, 8
    add r8, 16
    movzx hd, r6b
    mov srcq, r5
    mov tmpq, r8
    sub r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.h_w4:
    WIN64_SPILL_XMM 7
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
    mova m5, [base+subpel_h_shufA]
    mova m6, [base+pw_8192]
    movifnidn r2, stridemp
    pshufd m4, m4, q0000
    lea r3, [r2*3]
.h_w4_loop:
    movq m0, [srcq+r2*0]
    movq m1, [srcq+r2*1]
    movq m2, [srcq+r2*2]
    movq m3, [srcq+r3 ]
    lea srcq, [srcq+r2*4]
    REPX {pshufb x, m5}, m0, m1, m2, m3
    REPX {pmaddubsw x, m4}, m0, m1, m2, m3
    phaddw m0, m1
    phaddw m2, m3
    pmulhrsw m0, m6
    pmulhrsw m2, m6
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m2
    add tmpq, 32
    sub hd, 4
    jg .h_w4_loop
    RET
.h:
    test myd, 0xf00
    jnz .hv
    cmp wd, 4
    je .h_w4
    WIN64_SPILL_XMM 12
%if ARCH_X86_32
 %define strideq r6
    mov strideq, stridem
%endif
    tzcnt wd, wd
%if ARCH_X86_64
    mova m10, [base+subpel_h_shufA]
    mova m11, [base+subpel_h_shufB]
    mova m9, [base+subpel_h_shufC]
%else
 %define m10 [base+subpel_h_shufA]
 %define m11 [base+subpel_h_shufB]
 %define m9 [base+subpel_h_shufC]
%endif
    shr mxd, 16
    sub srcq, 3
    movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
    movq m6, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    mova m7, [base+pw_8192]
    pshufd m5, m6, q0000
    pshufd m6, m6, q1111
    add wq, base_reg
    jmp wq
%macro PREP_8TAP_H 2 ; dst, src_memloc
    movu m%1, [%2]
    pshufb m2, m%1, m11 ; subpel_h_shufB
    pshufb m3, m%1, m9 ; subpel_h_shufC
    pshufb m%1, m10 ; subpel_h_shufA
    mova m4, m2
    pmaddubsw m4, m5 ; subpel +0 B0
    pmaddubsw m2, m6 ; subpel +4 B4
    pmaddubsw m3, m6 ; subpel +4 C4
    pmaddubsw m%1, m5 ; subpel +0 A0
    paddw m3, m4
    paddw m%1, m2
    phaddw m%1, m3
    pmulhrsw m%1, m7
%endmacro
.h_w8:
    PREP_8TAP_H 0, srcq+strideq*0
    PREP_8TAP_H 1, srcq+strideq*1
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    lea srcq, [srcq+strideq*2]
    add tmpq, 32
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    mov r3, -16*1
    jmp .h_start
.h_w32:
    mov r3, -16*2
    jmp .h_start
.h_w64:
    mov r3, -16*4
    jmp .h_start
.h_w128:
    mov r3, -16*8
.h_start:
    sub srcq, r3
    mov r5, r3
.h_loop:
    PREP_8TAP_H 0, srcq+r3+8*0
    PREP_8TAP_H 1, srcq+r3+8*1
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    add tmpq, 32
    add r3, 16
    jl .h_loop
    add srcq, strideq
    mov r3, r5
    dec hd
    jg .h_loop
    RET
.hv:
    RESET_STACK_STATE
    cmp wd, 4
    jg .hv_w8
    and mxd, 0x7f
    movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
    mov mxd, myd
    shr myd, 16
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mov strideq, stridem
 %assign regs_used 6
    ALLOC_STACK -mmsize*14
 %assign regs_used 7
    lea r5, [strideq*3+1]
    sub srcq, r5
 %define subpelv0 [rsp+mmsize*0]
 %define subpelv1 [rsp+mmsize*1]
 %define subpelv2 [rsp+mmsize*2]
 %define subpelv3 [rsp+mmsize*3]
    punpcklbw m0, m0
    psraw m0, 8
    pshufd m6, m0, q0000
q0000 4190 mova subpelv0, m6 4191 pshufd m6, m0, q1111 4192 mova subpelv1, m6 4193 pshufd m6, m0, q2222 4194 mova subpelv2, m6 4195 pshufd m6, m0, q3333 4196 mova subpelv3, m6 4197%else 4198 movzx mxd, myb 4199 shr myd, 16 4200 cmp hd, 6 4201 cmovs myd, mxd 4202 movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] 4203 ALLOC_STACK mmsize*14, 14 4204 lea stride3q, [strideq*3] 4205 sub srcq, stride3q 4206 dec srcq 4207 %define subpelv0 m10 4208 %define subpelv1 m11 4209 %define subpelv2 m12 4210 %define subpelv3 m13 4211 punpcklbw m0, m0 4212 psraw m0, 8 4213 mova m8, [base+pw_8192] 4214 mova m9, [base+pd_32] 4215 pshufd m10, m0, q0000 4216 pshufd m11, m0, q1111 4217 pshufd m12, m0, q2222 4218 pshufd m13, m0, q3333 4219%endif 4220 pshufd m7, m1, q0000 4221%define hv4_line_0_0 4 4222%define hv4_line_0_1 5 4223%define hv4_line_0_2 6 4224%define hv4_line_0_3 7 4225%define hv4_line_0_4 8 4226%define hv4_line_0_5 9 4227%define hv4_line_1_0 10 4228%define hv4_line_1_1 11 4229%define hv4_line_1_2 12 4230%define hv4_line_1_3 13 4231%if ARCH_X86_32 4232 %define w8192reg [base+pw_8192] 4233 %define d32reg [base+pd_32] 4234%else 4235 %define w8192reg m8 4236 %define d32reg m9 4237%endif 4238 ; lower shuffle 0 1 2 3 4 4239 mova m6, [base+subpel_h_shuf4] 4240 movq m5, [srcq+strideq*0] ; 0 _ _ _ 4241 movhps m5, [srcq+strideq*1] ; 0 _ 1 _ 4242%if ARCH_X86_32 4243 lea srcq, [srcq+strideq*2] 4244 movq m4, [srcq+strideq*0] ; 2 _ _ _ 4245 movhps m4, [srcq+strideq*1] ; 2 _ 3 _ 4246 lea srcq, [srcq+strideq*2] 4247%else 4248 movq m4, [srcq+strideq*2] ; 2 _ _ _ 4249 movhps m4, [srcq+stride3q ] ; 2 _ 3 _ 4250 lea srcq, [srcq+strideq*4] 4251%endif 4252 pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ 4253 pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ 4254 pmaddubsw m2, m7 ;H subpel_filters 4255 pmaddubsw m0, m7 ;H subpel_filters 4256 phaddw m2, m0 4257 pmulhrsw m2, w8192reg 4258 SAVELINE_W4 m2, 2, 0 4259 ; upper shuffle 2 3 4 5 6 4260 mova m6, [base+subpel_h_shuf4+16] 4261 pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ 4262 pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ 4263 pmaddubsw m2, m7 ;H subpel_filters 4264 pmaddubsw m0, m7 ;H subpel_filters 4265 phaddw m2, m0 ;H 0 1 2 3 4266 pmulhrsw m2, w8192reg 4267 ; lower shuffle 4268 mova m6, [base+subpel_h_shuf4] 4269 movq m5, [srcq+strideq*0] ; 4 _ _ _ 4270 movhps m5, [srcq+strideq*1] ; 4 _ 5 _ 4271%if ARCH_X86_32 4272 lea srcq, [srcq+strideq*2] 4273 movq m4, [srcq+strideq*0] ; 6 _ _ _ 4274 add srcq, strideq 4275%else 4276 movq m4, [srcq+strideq*2] ; 6 _ _ _ 4277 add srcq, stride3q 4278%endif 4279 pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ 4280 pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ 4281 pmaddubsw m3, m7 ;H subpel_filters 4282 pmaddubsw m0, m7 ;H subpel_filters 4283 phaddw m3, m0 ;H 4 5 6 7 4284 pmulhrsw m3, w8192reg 4285 SAVELINE_W4 m3, 3, 0 4286 ; upper shuffle 4287 mova m6, [base+subpel_h_shuf4+16] 4288 pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ 4289 pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ 4290 pmaddubsw m3, m7 ;H subpel_filters 4291 pmaddubsw m0, m7 ;H subpel_filters 4292 phaddw m3, m0 ;H 4 5 6 7 4293 pmulhrsw m3, w8192reg 4294 ;process high 4295 palignr m4, m3, m2, 4;V 1 2 3 4 4296 punpcklwd m1, m2, m4 ; V 01 12 4297 punpckhwd m2, m4 ; V 23 34 4298 pshufd m0, m3, q2121;V 5 6 5 6 4299 punpcklwd m3, m0 ; V 45 56 4300 SAVELINE_W4 m0, 0, 1 4301 SAVELINE_W4 m1, 1, 1 4302 SAVELINE_W4 m2, 2, 1 4303 SAVELINE_W4 m3, 3, 1 4304 ;process low 4305 RESTORELINE_W4 m2, 2, 0 4306 RESTORELINE_W4 m3, 3, 0 4307 palignr m4, m3, m2, 4;V 1 2 3 4 4308 punpcklwd m1, m2, m4 ; V 01 12 4309 punpckhwd m2, m4 ; V 23 34 
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
.hv_w4_loop:
    ; process low
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4]
    movq m4, [srcq+strideq*0] ; 7
    movhps m4, [srcq+strideq*1] ; 7 _ 8 _
    pshufb m4, m6 ; H subpel_h_shuf4 7~8~
    pmaddubsw m4, m7 ; H subpel_filters
    phaddw m4, m4 ; H 7878
    pmulhrsw m4, w8192reg
    palignr m3, m4, m0, 12 ; 6787
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d32reg ; pd_32
    paddd m5, m4
    psrad m5, 6
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
    SAVELINE_W4 m2, 2, 0
    SAVELINE_W4 m3, 3, 0
    SAVELINE_W4 m5, 5, 0
    ; process high
    RESTORELINE_W4 m0, 0, 1
    RESTORELINE_W4 m1, 1, 1
    RESTORELINE_W4 m2, 2, 1
    RESTORELINE_W4 m3, 3, 1
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4+16]
    movq m4, [srcq+strideq*0] ; 7
    movhps m4, [srcq+strideq*1] ; 7 _ 8 _
    pshufb m4, m6 ; H subpel_h_shuf4 7~8~
    pmaddubsw m4, m7 ; H subpel_filters
    phaddw m4, m4 ; H 7878
    pmulhrsw m4, w8192reg
    palignr m3, m4, m0, 12 ; 6787
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d32reg ; pd_32
    paddd m5, m4
    psrad m4, m5, 6
    RESTORELINE_W4 m5, 5, 0
    packssdw m5, m4
    pshufd m5, m5, q3120
    movu [tmpq], m5
    lea srcq, [srcq+strideq*2]
    add tmpq, 16
    sub hd, 2
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    RESTORELINE_W4 m0, 0, 0
    RESTORELINE_W4 m1, 1, 0
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
    shr mxd, 16
%if ARCH_X86_32
 %define subpelh0 [rsp+mmsize*5]
 %define subpelh1 [rsp+mmsize*6]
 %define subpelv0 [rsp+mmsize*7]
 %define subpelv1 [rsp+mmsize*8]
 %define subpelv2 [rsp+mmsize*9]
 %define subpelv3 [rsp+mmsize*10]
 %define accuv0 [rsp+mmsize*11]
 %define accuv1 [rsp+mmsize*12]
    movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    mov mxd, myd
    shr myd, 16
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mov strideq, stridem
 %assign regs_used 6
    ALLOC_STACK -mmsize*14
 %assign regs_used 7
 %if STACK_ALIGNMENT < mmsize
  %define tmpm [rsp+mmsize*13+gprsize*1]
  %define srcm [rsp+mmsize*13+gprsize*2]
  %define stridem [rsp+mmsize*13+gprsize*3]
    mov tmpm, tmpq
    mov stridem, strideq
 %endif
    pshufd m0, m1, q0000
    pshufd m1, m1, q1111
    punpcklbw m5, m5
    psraw m5, 8
    pshufd m2, m5, q0000
    pshufd m3, m5, q1111
    pshufd m4, m5, q2222
    pshufd m5, m5, q3333
    mova subpelh0, m0
    mova subpelh1, m1
    mova subpelv0, m2
    mova subpelv1, m3
    mova subpelv2, m4
    mova subpelv3, m5
    lea r5, [strideq*3+3]
    sub srcq, r5
    mov srcm, srcq
%else
    ALLOC_STACK mmsize*5, 16
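    ; note: these 5 stack slots back the SAVELINE_W8/RESTORELINE_W8 ring
    ; (hv8_line_1..4 and hv8_line_6) that carries filtered line pairs
    ; across .hv_w8_loop iterations.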
mmsize*5, 16 4441 %define subpelh0 m10 4442 %define subpelh1 m11 4443 %define subpelv0 m12 4444 %define subpelv1 m13 4445 %define subpelv2 m14 4446 %define subpelv3 m15 4447 %define accuv0 m8 4448 %define accuv1 m9 4449 movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] 4450 movzx mxd, myb 4451 shr myd, 16 4452 cmp hd, 6 4453 cmovs myd, mxd 4454 movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] 4455 pshufd subpelh0, m0, q0000 4456 pshufd subpelh1, m0, q1111 4457 punpcklbw m1, m1 4458 psraw m1, 8 4459 pshufd subpelv0, m1, q0000 4460 pshufd subpelv1, m1, q1111 4461 pshufd subpelv2, m1, q2222 4462 pshufd subpelv3, m1, q3333 4463 lea stride3q, [strideq*3] 4464 sub srcq, 3 4465 sub srcq, stride3q 4466 mov r6, srcq 4467 mov r8, tmpq 4468%endif 4469 lea r5d, [wq-4] 4470 shl r5d, 14 4471 add r5d, hd 4472.hv_w8_loop0: 4473%if ARCH_X86_64 4474 mova m7, [base+subpel_h_shufA] 4475 mova m8, [base+subpel_h_shufB] 4476 mova m9, [base+subpel_h_shufC] 4477 %define shufA m7 4478 %define shufB m8 4479 %define shufC m9 4480%else 4481 %define shufA [base+subpel_h_shufA] 4482 %define shufB [base+subpel_h_shufB] 4483 %define shufC [base+subpel_h_shufC] 4484%endif 4485%macro PREP_8TAP_HV 2 ; dst, src_memloc, tmp[1-2] 4486 movu %1, [%2] 4487 pshufb m2, %1, shufB 4488 pshufb m3, %1, shufC 4489 pshufb %1, shufA 4490 mova m1, m2 4491 pmaddubsw m1, subpelh0 ; subpel +0 C0 4492 pmaddubsw m3, subpelh1 ; subpel +4 B4 4493 pmaddubsw m2, subpelh1 ; C4 4494 pmaddubsw %1, subpelh0 ; A0 4495 paddw m1, m3 ; C0+B4 4496 paddw %1, m2 ; A0+C4 4497 phaddw %1, m1 4498%endmacro 4499 PREP_8TAP_HV m4, srcq+strideq*0 4500 PREP_8TAP_HV m5, srcq+strideq*1 4501%if ARCH_X86_64 4502 PREP_8TAP_HV m6, srcq+strideq*2 4503 add srcq, stride3q 4504 PREP_8TAP_HV m0, srcq+strideq*0 4505%else 4506 lea srcq, [srcq+strideq*2] 4507 PREP_8TAP_HV m6, srcq+strideq*0 4508 PREP_8TAP_HV m0, srcq+strideq*1 4509 lea srcq, [srcq+strideq*2] 4510%endif 4511 mova m7, [base+pw_8192] 4512 REPX {pmulhrsw x, m7}, m4, m5, m6, m0 4513 punpcklwd m1, m4, m5 ; 01 4514 punpcklwd m2, m5, m6 ; 12 4515 punpcklwd m3, m6, m0 ; 23 4516 SAVELINE_W8 1, m1 4517 SAVELINE_W8 2, m2 4518 SAVELINE_W8 3, m3 4519 mova m7, [base+subpel_h_shufA] 4520%if ARCH_X86_64 4521 PREP_8TAP_HV m4, srcq+strideq*1 4522 PREP_8TAP_HV m5, srcq+strideq*2 4523 add srcq, stride3q 4524 PREP_8TAP_HV m6, srcq+strideq*0 4525%else 4526 PREP_8TAP_HV m4, srcq+strideq*0 4527 PREP_8TAP_HV m5, srcq+strideq*1 4528 lea srcq, [srcq+strideq*2] 4529 PREP_8TAP_HV m6, srcq+strideq*0 4530%endif 4531 mova m3, [base+pw_8192] 4532 pmulhrsw m1, m3, m4 4533 pmulhrsw m2, m3, m5 4534 pmulhrsw m3, m6 4535 punpcklwd m4, m0, m1 ; 34 4536 punpcklwd m5, m1, m2 ; 45 4537 punpcklwd m6, m2, m3 ; 56 4538 SAVELINE_W8 6, m3 4539 RESTORELINE_W8 1, m1 4540 RESTORELINE_W8 2, m2 4541 RESTORELINE_W8 3, m3 4542.hv_w8_loop: 4543 SAVELINE_W8 1, m3 4544 SAVELINE_W8 2, m4 4545 SAVELINE_W8 3, m5 4546 SAVELINE_W8 4, m6 4547%if ARCH_X86_32 4548 pmaddwd m0, m1, subpelv0 ; a0 4549 pmaddwd m7, m2, subpelv0 ; b0 4550 pmaddwd m3, subpelv1 ; a1 4551 pmaddwd m4, subpelv1 ; b1 4552 paddd m0, m3 4553 paddd m7, m4 4554 pmaddwd m5, subpelv2 ; a2 4555 pmaddwd m6, subpelv2 ; b2 4556 paddd m0, m5 4557 paddd m7, m6 4558 mova m5, [base+pd_32] 4559 paddd m0, m5 4560 paddd m7, m5 4561 mova accuv0, m0 4562 mova accuv1, m7 4563%else 4564 pmaddwd accuv0, m1, subpelv0 ; a0 4565 pmaddwd accuv1, m2, subpelv0 ; b0 4566 pmaddwd m3, subpelv1 ; a1 4567 pmaddwd m4, subpelv1 ; b1 4568 paddd accuv0, m3 4569 paddd accuv1, m4 4570 pmaddwd m5, subpelv2 ; a2 4571 pmaddwd m6, subpelv2 ; b2 
    paddd accuv0, m5
    paddd accuv1, m6
    mova m7, [base+pd_32]
    paddd accuv0, m7
    paddd accuv1, m7
    mova m7, [base+subpel_h_shufB]
    mova m6, [base+subpel_h_shufC]
    mova m5, [base+subpel_h_shufA]
 %define shufA m5
 %define shufB m7
 %define shufC m6
%endif
    PREP_8TAP_HV m0, srcq+strideq*1
    lea srcq, [srcq+strideq*2]
    PREP_8TAP_HV m4, srcq+strideq*0
    mova m5, [base+pw_8192]
    pmulhrsw m0, m5
    pmulhrsw m4, m5
    RESTORELINE_W8 6, m6
    punpcklwd m5, m6, m0 ; 67
    punpcklwd m6, m0, m4 ; 78
    pmaddwd m1, m5, subpelv3 ; a3
    paddd m2, m1, accuv0
    pmaddwd m1, m6, subpelv3 ; b3
    paddd m1, m1, accuv1
    psrad m2, 6
    psrad m1, 6
    packssdw m2, m1
    movq [tmpq+wq*0], m2
    movhps [tmpq+wq*2], m2
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jle .hv_w8_outer
    SAVELINE_W8 6, m4
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
    RESTORELINE_W8 4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov srcq, srcm
    mov tmpq, tmpm
    movzx hd, r5w
    add srcq, 4
    add tmpq, 8
    mov srcm, srcq
    mov tmpm, tmpq
%else
    add r6, 4
    add r8, 8
    movzx hd, r5b
    mov srcq, r6
    mov tmpq, r8
%endif
    sub r5d, 1<<16
    jg .hv_w8_loop0
    RET

%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

%macro SAVE_REG 1
 %xdefine r%1_save r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

%macro LOAD_REG 1
 %xdefine r%1 r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

%macro REMAP_REG 2-3
 %xdefine r%1 r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
    SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
    SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP m%2, m%5
    movq m%1, [srcq+ r4]
    movq m%2, [srcq+ r6]
    movhps m%1, [srcq+ r7]
    movhps m%2, [srcq+ r9]
    movq m%3, [srcq+r10]
    movq m%4, [srcq+r11]
    movhps m%3, [srcq+r13]
    movhps m%4, [srcq+ rX]
    add srcq, ssq
    movq m%5, [srcq+ r4]
    movq m%6, [srcq+ r6]
    movhps m%5, [srcq+ r7]
%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP       m%2, m%5
    movq       m%1, [srcq+ r4]
    movq       m%2, [srcq+ r6]
    movhps     m%1, [srcq+ r7]
    movhps     m%2, [srcq+ r9]
    movq       m%3, [srcq+r10]
    movq       m%4, [srcq+r11]
    movhps     m%3, [srcq+r13]
    movhps     m%4, [srcq+ rX]
    add        srcq, ssq
    movq       m%5, [srcq+ r4]
    movq       m%6, [srcq+ r6]
    movhps     m%5, [srcq+ r7]
    movhps     m%6, [srcq+ r9]
    movq       m%7, [srcq+r10]
    movq       m%8, [srcq+r11]
    movhps     m%7, [srcq+r13]
    movhps     m%8, [srcq+ rX]
    add        srcq, ssq
    pmaddubsw  m%1, m%9
    pmaddubsw  m%5, m%9
    pmaddubsw  m%2, m%10
    pmaddubsw  m%6, m%10
    pmaddubsw  m%3, m%11
    pmaddubsw  m%7, m%11
    pmaddubsw  m%4, m%12
    pmaddubsw  m%8, m%12
    phaddw     m%1, m%2
    phaddw     m%5, m%6
    phaddw     m%3, m%4
    phaddw     m%7, m%8
    phaddw     m%1, m%3
    phaddw     m%5, m%7
    pmulhrsw   m%1, m12
    pmulhrsw   m%5, m12
    SWAP       m%2, m%5
 %endmacro
%else
 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
  %if %3 == 1
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
  %endif
    movq       m0, [srcq+r0]
    movq       m1, [srcq+rX]
    movhps     m0, [srcq+r4]
    movhps     m1, [srcq+r5]
    add        srcq, ssq
    movq       m4, [srcq+r0]
    movq       m5, [srcq+rX]
    movhps     m4, [srcq+r4]
    movhps     m5, [srcq+r5]
    mov        r0, [esp+16]
    mov        rX, [esp+24]
    mov        r4, [esp+20]
    mov        r5, [esp+28]
    sub        srcq, ssq
    movq       m2, [srcq+r0]
    movq       m3, [srcq+rX]
    movhps     m2, [srcq+r4]
    movhps     m3, [srcq+r5]
    add        srcq, ssq
    movq       m6, [srcq+r0]
    movq       m7, [srcq+rX]
    movhps     m6, [srcq+r4]
    movhps     m7, [srcq+r5]
    add        srcq, ssq
    pmaddubsw  m0, [esp+%1+ 0]
    pmaddubsw  m4, [esp+%1+ 0]
    pmaddubsw  m1, [esp+%1+16]
    pmaddubsw  m5, [esp+%1+16]
    pmaddubsw  m2, [esp+%1+32]
    pmaddubsw  m6, [esp+%1+32]
    pmaddubsw  m3, [esp+%1+48]
    pmaddubsw  m7, [esp+%1+48]
    phaddw     m0, m1
    phaddw     m4, m5
    phaddw     m2, m3
    phaddw     m6, m7
    phaddw     m0, m2
    phaddw     m4, m6
    pmulhrsw   m0, m12
    pmulhrsw   m4, m12
  %if %2 != 0
    mova       [esp+%2+ 0], m0
    mova       [esp+%2+16], m4
  %endif
 %endmacro
%endif
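
; MC_8TAP_SCALED: shared body of put_8tap_scaled_8bpc and prep_8tap_scaled_8bpc.
; dx/dy are the horizontal/vertical steps in 10-bit fixed point (1024 = one
; source pixel per output pixel). dy == 1024 and dy == 2048 take the
; specialized .dy1/.dy2 paths below; anything else goes through the generic
; per-width code. rndshift is 10 for put (final pixels) and 6 for prep
; (16-bit intermediates).
;
; Rough scalar model of the vertical pass (illustrative pseudo-C only, not
; the dav1d API; get_v_filter/sum_8taps are hypothetical helpers):
;   for (y = 0; y < h; y++, my += dy) {
;       const int8_t *vf = get_v_filter(t1 + ((my & 0x3ff) >> 6));
;       for (x = 0; x < w; x++)
;           out[x] = rnd(sum_8taps(vf, hbuf[x]), rndshift);
;   }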
%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isprep 0
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %endif
 %xdefine base_reg r12
 %define rndshift 10
%else ; prep
 %assign isprep 1
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %xdefine tmp_stridem r14q
  %else
cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %define tmp_stridem qword [rsp+0x138]
  %endif
  %xdefine base_reg r11
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %else
cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %endif
  %define tmp_stridem dword [esp+0x138]
 %endif
 %define rndshift 6
%endif
%if ARCH_X86_32
    mov        [esp+0x1f0], t0d
    mov        [esp+0x1f4], t1d
 %if !isprep && required_stack_alignment > STACK_ALIGNMENT
    mov        dstd, dstm
    mov        dsd, dsm
    mov        srcd, srcm
    mov        ssd, ssm
    mov        hd, hm
    mov        r4, mxm
  %define r0m  [esp+0x200]
  %define dsm  [esp+0x204]
  %define dsmp dsm
  %define r1m  dsm
  %define r2m  [esp+0x208]
  %define ssm  [esp+0x20c]
  %define r3m  ssm
  %define hm   [esp+0x210]
  %define mxm  [esp+0x214]
    mov        r0m, dstd
    mov        dsm, dsd
    mov        r2m, srcd
    mov        ssm, ssd
    mov        hm, hd
    mov        r0, mym
    mov        r1, dxm
    mov        r2, dym
  %define mym  [esp+0x218]
  %define dxm  [esp+0x09c]
  %define dym  [esp+0x21c]
    mov        mxm, r4
    mov        mym, r0
    mov        dxm, r1
    mov        dym, r2
    tzcnt      wd, wm
 %endif
 %if isprep && required_stack_alignment > STACK_ALIGNMENT
  %xdefine base_reg r5
 %else
  %xdefine base_reg r6
 %endif
    mov        ssd, ssm
%endif
    LEA        base_reg, %1_8tap_scaled_8bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
    tzcnt      wd, wm
%endif
%if ARCH_X86_32
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
%endif
    movd       m8, dxm
    movd       m14, mxm
    pshufd     m8, m8, q0000
    pshufd     m14, m14, q0000
%if isprep && UNIX64
    mov        r5d, t0d
    DECLARE_REG_TMP 5, 7
%endif
%if ARCH_X86_64
    mov        dyd, dym
%endif
%ifidn %1, put
 %if WIN64
    mov        r8d, hm
    DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r5m
  %define dxm r8m
 %elif ARCH_X86_64
    DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm  r6m
 %endif
 %if ARCH_X86_64
  %if required_stack_alignment > STACK_ALIGNMENT
   %define dsm [rsp+0x138]
   %define rX  r1
   %define rXd r1d
  %else
   %define dsm dsq
   %define rX  r14
   %define rXd r14d
  %endif
 %else
  %define rX r1
 %endif
%else ; prep
 %if WIN64
    mov        r7d, hm
    DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r4m
  %define dxm r7m
 %elif ARCH_X86_64
    DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %define hm  [rsp+0x94]
 %endif
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %if ARCH_X86_64
  %define rX  r14
  %define rXd r14d
 %else
  %define rX  r3
 %endif
%endif
%if ARCH_X86_64
    mova       m10, [base+pd_0x3ff]
    mova       m12, [base+pw_8192]
 %ifidn %1, put
    mova       m13, [base+pd_512]
 %else
    mova       m13, [base+pd_32]
 %endif
%else
 %define m10 [base+pd_0x3ff]
 %define m12 [base+pw_8192]
 %ifidn %1, put
  %define m13 [base+pd_512]
 %else
  %define m13 [base+pd_32]
 %endif
%endif
    pxor       m9, m9
%if ARCH_X86_64
    lea        ss3q, [ssq*3]
    movzx      r7d, t1b
    shr        t1d, 16
    cmp        hd, 6
    cmovs      t1d, r7d
    sub        srcq, ss3q
%else
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    mov        r1, [esp+0x1f4]
    lea        r0, [ssq*3]
    movzx      r2, r1b
    shr        r1, 16
    cmp        dword hm, 6
    cmovs      r1, r2
    mov        [esp+0x1f4], r1
    mov        r1, r1m
    mov        r2, r2m
    sub        srcq, r0
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define ss3q r0
 %define myd  r4
 %define dyd  dword dym
 %define hd   dword hm
%endif
    cmp        dyd, 1024
    je         .dy1
    cmp        dyd, 2048
    je         .dy2
    movzx      wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
    add        wq, base_reg
    jmp        wq
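
; .w2 (put only): my accumulates dy per output row; its low 10 bits select
; the vertical subpel filter, and when the integer part advances, bit 0x400
; distinguishes a one-row step from a two-row step (.w2_skip_line).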
%ifidn %1, put
.w2:
 %if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
 %else
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
 %endif
    punpckldq  m9, m8
    SWAP       m8, m9
    paddd      m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd     m15, m15, q0000
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    movd       r4d, m15
    psrldq     m15, 4
 %if ARCH_X86_64
    movd       r6d, m15
 %else
    movd       r3d, m15
 %endif
    mova       m5, [base+bdct_lb_dw]
    mova       m6, [base+subpel_s_shuf2]
    movd       m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd       m7, [base+subpel_filters+r6*8+2]
 %else
    movd       m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor       m9, m9
    pcmpeqd    m8, m9
    psrld      m14, 10
 %if ARCH_X86_32
    mov        r3, r3m
    pshufb     m14, m5
    paddb      m14, m6
    mova       [rsp+0x180], m14
    SWAP       m5, m0
    SWAP       m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq       m0, [srcq+ssq*0]
    movq       m2, [srcq+ssq*2]
    movhps     m0, [srcq+ssq*1]
    movhps     m2, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    pshufb     m14, m5
    paddb      m14, m6
 %endif
    movq       m1, [srcq+ssq*0]
    movq       m3, [srcq+ssq*2]
    movhps     m1, [srcq+ssq*1]
    movhps     m3, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    punpckldq  m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand       m11, m8
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m11
 %else
    pand       m7, m8, m11
    pandn      m8, m15
  %define m8  m6
  %define m15 m5
    por        m15, m7
    mova       [rsp+0x190], m15
 %endif
    pshufb     m0, m14
    pshufb     m2, m14
    pshufb     m1, m14
    pshufb     m3, m14
    pmaddubsw  m0, m15
    pmaddubsw  m2, m15
    pmaddubsw  m1, m15
    pmaddubsw  m3, m15
    phaddw     m0, m2
    phaddw     m1, m3
    pmulhrsw   m0, m12        ; 0 1 2 3
    pmulhrsw   m1, m12        ; 4 5 6 7
    palignr    m2, m1, m0, 4  ; 1 2 3 4
    punpcklwd  m3, m0, m2     ; 01 12
    punpckhwd  m0, m2         ; 23 34
    pshufd     m5, m1, q0321  ; 5 6 7 _
    punpcklwd  m2, m1, m5     ; 45 56
    punpckhwd  m4, m1, m5     ; 67 __
 %if ARCH_X86_32
    mov        myd, mym
    mov        r0, r0m
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
.w2_loop:
    and        myd, 0x3ff
 %if ARCH_X86_64
    mov        r6d, 64 << 24
    mov        r4d, myd
    shr        r4d, 6
    lea        r4d, [t1+r4]
    cmovnz     r6q, [base+subpel_filters+r4*8]
    movq       m11, r6q
    punpcklbw  m11, m11
    psraw      m11, 8
    pshufd     m8, m11, q0000
    pshufd     m9, m11, q1111
    pshufd     m10, m11, q2222
    pshufd     m11, m11, q3333
    pmaddwd    m5, m3, m8
    pmaddwd    m6, m0, m9
    pmaddwd    m7, m2, m10
    pmaddwd    m8, m4, m11
    paddd      m5, m6
    paddd      m7, m8
 %else
    mov        mym, myd
    mov        r1, [esp+0x1f4]
    xor        r3, r3
    shr        r4, 6
    lea        r1, [r1+r4]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r1*8+0]
    cmovnz     r3, [base+subpel_filters+r1*8+4]
    movd       m7, r4
    movd       m6, r3
    punpckldq  m7, m6
    punpcklbw  m7, m7
    psraw      m7, 8
    pshufd     m5, m7, q0000
    pshufd     m6, m7, q1111
    pmaddwd    m3, m5
    pmaddwd    m0, m6
    pshufd     m5, m7, q2222
    pshufd     m7, m7, q3333
    pmaddwd    m2, m5
    pmaddwd    m4, m7
    paddd      m3, m0
    paddd      m2, m4
    SWAP       m5, m3
    SWAP       m7, m2
 %endif
    paddd      m5, m13
    paddd      m5, m7
    psrad      m5, 10
    packssdw   m5, m5
    packuswb   m5, m5
 %if ARCH_X86_64
    pextrw     r6d, m5, 0
    mov        [dstq], r6w
    add        dstq, dsq
    dec        hd
    jz         .ret
    add        myd, dyd
 %else
    pextrw     r3d, m5, 0
    mov        [dstq], r3w
    add        dstq, dsm
    dec        hd
    jz         .ret
    mov        myd, mym
    add        myd, dym
 %endif
    test       myd, ~0x3ff
 %if ARCH_X86_32
    SWAP       m3, m5
    SWAP       m2, m7
    mova       m3, [rsp+0x1a0]
    mova       m0, [rsp+0x1b0]
    mova       m2, [rsp+0x1c0]
    mova       m4, [rsp+0x1d0]
  %define m14 [esp+0x180]
  %define m15 [esp+0x190]
 %endif
    jz         .w2_loop
 %if ARCH_X86_32
    mov        r3, r3m
 %endif
    movq       m5, [srcq]
    test       myd, 0x400
    jz         .w2_skip_line
    add        srcq, ssq
    shufps     m3, m0, q1032  ; 01 12
    shufps     m0, m2, q1032  ; 23 34
    shufps     m2, m4, q1032  ; 45 56
    pshufb     m5, m14
    pmaddubsw  m5, m15
    phaddw     m5, m5
    pmulhrsw   m5, m12
    palignr    m4, m5, m1, 12
    punpcklqdq m1, m4, m4     ; 6 7 6 7
    punpcklwd  m4, m1, m5     ; 67 __
 %if ARCH_X86_32
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
    jmp        .w2_loop
.w2_skip_line:
    movhps     m5, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m3, m0          ; 01 12
    mova       m0, m2          ; 23 34
    pshufb     m5, m14
    pmaddubsw  m5, m15
    phaddw     m5, m5
    pmulhrsw   m5, m12         ; 6 7 6 7
    palignr    m4, m5, m1, 8   ; 4 5 6 7
    pshufd     m5, m4, q0321   ; 5 6 7 _
    mova       m1, m4
    punpcklwd  m2, m4, m5      ; 45 56
    punpckhwd  m4, m5          ; 67 __
 %if ARCH_X86_32
    mova       [rsp+0x1a0], m3
    mova       [rsp+0x1b0], m0
    mova       [rsp+0x1c0], m2
    mova       [rsp+0x1d0], m4
 %endif
    jmp        .w2_loop
%endif
INIT_XMM ssse3
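
; .w4: same vertical scheme as .w2, but with four per-pixel horizontal
; filters gathered from mx+dx*[0-3]. On x86_32 the 01/23/45/67 row pairs
; are kept on the stack instead of in registers.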
.w4:
%if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
%else
 %define m8   m0
 %xdefine m14 m4
 %define m15  m3
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
%endif
    pmaddwd    m8, [base+rescale_mul]
%if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
%else
 %define m11 [base+pd_0x4000]
%endif
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
    pand       m0, m14, m10
    psrld      m0, 6
    paddd      m15, m0
    psrldq     m7, m15, 8
%if ARCH_X86_64
    movd       r4d, m15
    movd       r11d, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r6d, m15
    movd       r13d, m7
    movd       m15, [base+subpel_filters+ r4*8+2]
    movd       m2, [base+subpel_filters+r11*8+2]
    movd       m3, [base+subpel_filters+ r6*8+2]
    movd       m4, [base+subpel_filters+r13*8+2]
%else
    movd       r0, m15
    movd       rX, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r4, m15
    movd       r5, m7
    movd       m1, [base+subpel_filters+r0*8+2]
    movd       m2, [base+subpel_filters+rX*8+2]
    movd       m3, [base+subpel_filters+r4*8+2]
    movd       m7, [base+subpel_filters+r5*8+2]
    movifprep  r3, r3m
    SWAP       m4, m7
 %define m15 m1
%endif
    mova       m5, [base+bdct_lb_dw]
    movq       m6, [base+subpel_s_shuf2]
    psrld      m14, 10
    punpckldq  m15, m3
    punpckldq  m2, m4
    punpcklqdq m15, m2
    punpcklqdq m6, m6
    pshufb     m14, m5
    paddb      m14, m6
%if ARCH_X86_64
    pcmpeqd    m0, m9
    pand       m11, m0
%else
    mova       [esp+0x180], m14
    SWAP       m7, m4
    pxor       m3, m3
    pcmpeqd    m0, m3
    pand       m2, m11, m0
 %define m11 m2
%endif
    pandn      m0, m15
%if ARCH_X86_64
    SWAP       m15, m0
%else
 %define m15 m0
%endif
    por        m15, m11
%if ARCH_X86_64
    movu       m7, [srcq+ssq*0]
    movu       m9, [srcq+ssq*1]
    movu       m8, [srcq+ssq*2]
    movu       m10, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    movu       m2, [srcq+ssq*0]
    movu       m4, [srcq+ssq*1]
    movu       m3, [srcq+ssq*2]
    movu       m5, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m7, m14
    pshufb     m9, m14
    pshufb     m8, m14
    pshufb     m10, m14
    pshufb     m2, m14
    pshufb     m4, m14
    pshufb     m3, m14
    pshufb     m5, m14
    pmaddubsw  m7, m15
    pmaddubsw  m9, m15
    pmaddubsw  m8, m15
    pmaddubsw  m10, m15
    pmaddubsw  m2, m15
    pmaddubsw  m4, m15
    pmaddubsw  m3, m15
    pmaddubsw  m5, m15
    phaddw     m7, m9
    phaddw     m8, m10
    phaddw     m9, m2, m4
    phaddw     m3, m5
    pmulhrsw   m7, m12           ; 0 1
    pmulhrsw   m8, m12           ; 2 3
    pmulhrsw   m9, m12           ; 4 5
    pmulhrsw   m3, m12           ; 6 7
    shufps     m4, m7, m8, q1032 ; 1 2
    shufps     m5, m8, m9, q1032 ; 3 4
    shufps     m6, m9, m3, q1032 ; 5 6
    psrldq     m11, m3, 8        ; 7 _
    punpcklwd  m0, m7, m4        ; 01
    punpckhwd  m7, m4            ; 12
    punpcklwd  m1, m8, m5        ; 23
    punpckhwd  m8, m5            ; 34
    punpcklwd  m2, m9, m6        ; 45
    punpckhwd  m9, m6            ; 56
    punpcklwd  m3, m11           ; 67
    mova       [rsp+0x00], m7
    mova       [rsp+0x10], m8
    mova       [rsp+0x20], m9
%else
    mova       [esp+0x190], m15
    lea        ss3q, [ssq*3]
    movu       m2, [srcq+ssq*0]
    movu       m3, [srcq+ssq*1]
    movu       m7, [srcq+ssq*2]
    movu       m6, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m2, m14
    pshufb     m3, m14
    pshufb     m7, m14
    pshufb     m6, m14
    pmaddubsw  m2, m15
    pmaddubsw  m3, m15
    pmaddubsw  m7, m15
    pmaddubsw  m6, m15
    phaddw     m2, m3
    phaddw     m7, m6
    movu       m1, [srcq+ssq*0]
    movu       m5, [srcq+ssq*1]
    movu       m3, [srcq+ssq*2]
    movu       m6, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pshufb     m1, m14
    pshufb     m5, m14
    pshufb     m3, m14
    pshufb     m6, m14
    pmaddubsw  m1, m15
    pmaddubsw  m5, m15
    pmaddubsw  m3, m15
    pmaddubsw  m6, m15
    phaddw     m1, m5
    phaddw     m3, m6
    pmulhrsw   m2, m12
    pmulhrsw   m7, m12
    pmulhrsw   m1, m12
    pmulhrsw   m3, m12
    shufps     m4, m2, m7, q1032 ; 1 2
    shufps     m5, m7, m1, q1032 ; 3 4
    shufps     m6, m1, m3, q1032 ; 5 6
    psrldq     m0, m3, 8         ; 7 _
    mova       [esp+0x1a0], m0
 %define m11 [esp+0x1a0]
    punpcklwd  m0, m2, m4        ; 01
    punpckhwd  m2, m4            ; 12
    punpcklwd  m4, m7, m5        ; 23
    punpckhwd  m7, m5            ; 34
    punpcklwd  m5, m1, m6        ; 45
    punpckhwd  m1, m6            ; 56
    punpcklwd  m3, [esp+0x1a0]   ; 67
    mov        myd, mym
    mov        r0, r0m
    mova       [esp+0x1b0], m0 ; 01
    mova       [esp+0x1c0], m4 ; 23
    mova       [esp+0x1d0], m5 ; 45
    mova       [esp+0x1e0], m3 ; 67
    mova       [rsp+0x00], m2 ; 12
    mova       [rsp+0x10], m7 ; 34
    mova       [rsp+0x20], m1 ; 56
    SWAP       m1, m4
    SWAP       m2, m5
%endif
.w4_loop:
    and        myd, 0x3ff
%if ARCH_X86_64
    mov        r6d, 64 << 24
    mov        r4d, myd
    shr        r4d, 6
    lea        r4d, [t1+r4]
    cmovnz     r6q, [base+subpel_filters+r4*8]
    movq       m10, r6q
    punpcklbw  m10, m10
    psraw      m10, 8
    pshufd     m7, m10, q0000
    pshufd     m8, m10, q1111
    pshufd     m9, m10, q2222
    pshufd     m10, m10, q3333
    pmaddwd    m4, m0, m7
    pmaddwd    m5, m1, m8
    pmaddwd    m6, m2, m9
    pmaddwd    m7, m3, m10
    paddd      m4, m5
    paddd      m6, m7
    paddd      m4, m13
    paddd      m4, m6
%else
    mov        mym, myd
    mov        r5, [esp+0x1f4]
    xor        r3, r3
    shr        r4, 6
    lea        r5, [r5+r4]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r5*8+0]
    cmovnz     r3, [base+subpel_filters+r5*8+4]
    movd       m7, r4
    movd       m6, r3
    punpckldq  m7, m6
    punpcklbw  m7, m7
    psraw      m7, 8
    pshufd     m4, m7, q0000
    pshufd     m5, m7, q1111
    pshufd     m6, m7, q2222
    pshufd     m7, m7, q3333
    pmaddwd    m0, m4
    pmaddwd    m1, m5
    pmaddwd    m2, m6
    pmaddwd    m3, m7
    paddd      m0, m1
    paddd      m2, m3
    paddd      m0, m13
    paddd      m0, m2
    SWAP       m4, m0
%endif
    psrad      m4, rndshift
    packssdw   m4, m4
%ifidn %1, put
    packuswb   m4, m4
    movd       [dstq], m4
    add        dstq, dsmp
%else
    movq       [tmpq], m4
    add        tmpq, 8
%endif
    dec        hd
    jz         .ret
%if ARCH_X86_64
    add        myd, dyd
    test       myd, ~0x3ff
    jz         .w4_loop
%else
    SWAP       m0, m4
    mov        myd, mym
    mov        r3, r3m
    add        myd, dym
    test       myd, ~0x3ff
    jnz        .w4_next_line
    mova       m0, [esp+0x1b0]
    mova       m1, [esp+0x1c0]
    mova       m2, [esp+0x1d0]
    mova       m3, [esp+0x1e0]
    jmp        .w4_loop
.w4_next_line:
 %define m14 [esp+0x180]
 %define m15 [esp+0x190]
%endif
    movu       m4, [srcq]
    test       myd, 0x400
    jz         .w4_skip_line
%if ARCH_X86_64
    mova       m0, [rsp+0x00]
    mova       [rsp+0x00], m1
    mova       m1, [rsp+0x10]
    mova       [rsp+0x10], m2
    mova       m2, [rsp+0x20]
    mova       [rsp+0x20], m3
%else
    mova       m5, [esp+0x1c0]
    mova       m0, [rsp+0x000]
    mova       [rsp+0x00], m5
    mova       [esp+0x1b0], m0
    mova       m6, [esp+0x1d0]
    mova       m1, [rsp+0x010]
    mova       [rsp+0x10], m6
    mova       [esp+0x1c0], m1
    mova       m7, [esp+0x1e0]
    mova       m2, [rsp+0x020]
    mova       [rsp+0x20], m7
    mova       [esp+0x1d0], m2
%endif
    pshufb     m4, m14
    pmaddubsw  m4, m15
    phaddw     m4, m4
    pmulhrsw   m4, m12
    punpcklwd  m3, m11, m4
%if ARCH_X86_32
    mova       [esp+0x1e0], m3
%endif
    mova       m11, m4
    add        srcq, ssq
    jmp        .w4_loop
.w4_skip_line:
%if ARCH_X86_32
    mova       m0, [esp+0x1c0]
    mova       m1, [esp+0x1d0]
    mova       m2, [esp+0x1e0]
%endif
    movu       m5, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       m6, [rsp+0x10]
    mova       m7, [rsp+0x20]
    pshufb     m4, m14
    pshufb     m5, m14
    pmaddubsw  m4, m15
    pmaddubsw  m5, m15
    phaddw     m4, m5
    pmulhrsw   m4, m12
    punpcklwd  m5, m11, m4
    mova       [rsp+0x00], m6
    mova       [rsp+0x10], m7
    mova       [rsp+0x20], m5
%if ARCH_X86_64
    psrldq     m11, m4, 8
    mova       m0, m1
    mova       m1, m2
    mova       m2, m3
    punpcklwd  m3, m4, m11
%else
    psrldq     m6, m4, 8
    punpcklwd  m3, m4, m6
    mova       [esp+0x1a0], m6
    mova       [esp+0x1b0], m0
    mova       [esp+0x1c0], m1
    mova       [esp+0x1d0], m2
    mova       [esp+0x1e0], m3
%endif
    jmp        .w4_loop
INIT_XMM ssse3
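
; .w8..w128: all widths >= 8 share .w_start; [rsp+0x90] counts 8-pixel-wide
; column strips and .hloop_prep advances dstq/tmpq to the next strip, with
; tmp_stridem as the prep row stride.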
.w8:
    mov        dword [rsp+0x90], 1
    movifprep  tmp_stridem, 16
    jmp        .w_start
.w16:
    mov        dword [rsp+0x90], 2
    movifprep  tmp_stridem, 32
    jmp        .w_start
.w32:
    mov        dword [rsp+0x90], 4
    movifprep  tmp_stridem, 64
    jmp        .w_start
.w64:
    mov        dword [rsp+0x90], 8
    movifprep  tmp_stridem, 128
    jmp        .w_start
.w128:
    mov        dword [rsp+0x90], 16
    movifprep  tmp_stridem, 256
.w_start:
%ifidn %1, put
    movifnidn  dsm, dsq
%endif
%if ARCH_X86_64
    shr        t0d, 16
    movd       m15, t0d
%else
 %define m8   m0
 %xdefine m14 m4
 %define m15  m3
 %if isprep
  %define ssq ssm
 %endif
    mov        r4, [esp+0x1f0]
    shr        r4, 16
    movd       m15, r4
    mov        r0, r0m
    mov        myd, mym
%endif
    sub        srcq, 3
    pslld      m7, m8, 2 ; dx*4
    pmaddwd    m8, [base+rescale_mul] ; dx*[0-3]
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
    mova       [rsp+0x100], m7
    mova       [rsp+0x120], m15
    mov        [rsp+0x098], srcq
    mov        [rsp+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
    mov        hm, hd
%elif ARCH_X86_32
    mov        r5, hm
    mov        [esp+0x094], myd
    mov        [esp+0x134], r5
%endif
    jmp        .hloop
.hloop_prep:
    dec        dword [rsp+0x090]
    jz         .ret
%if ARCH_X86_64
    add        qword [rsp+0x130], 8*(isprep+1)
    mov        hd, hm
%else
    add        dword [esp+0x130], 8*(isprep+1)
    mov        myd, [esp+0x094]
    mov        r5, [esp+0x134]
    mov        r0, [esp+0x130]
%endif
    mova       m7, [rsp+0x100]
    mova       m14, [rsp+0x110]
%if ARCH_X86_64
    mova       m10, [base+pd_0x3ff]
%endif
    mova       m15, [rsp+0x120]
    pxor       m9, m9
    mov        srcq, [rsp+0x098]
%if ARCH_X86_64
    mov        r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov        mym, myd
    mov        hm, r5
    mov        r0m, r0
    mov        r3, r3m
%endif
    paddd      m14, m7
.hloop:
%if ARCH_X86_64
    mova       m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld      m2, m14, 10
    mova       [rsp], m2
    pand       m6, m14, m10
    psrld      m6, 6
    paddd      m5, m15, m6
    pcmpeqd    m6, m9
    psrldq     m2, m5, 8
%if ARCH_X86_64
    movd       r4d, m5
    movd       r6d, m2
    psrldq     m5, 4
    psrldq     m2, 4
    movd       r7d, m5
    movd       r9d, m2
    movq       m0, [base+subpel_filters+r4*8]
    movq       m1, [base+subpel_filters+r6*8]
    movhps     m0, [base+subpel_filters+r7*8]
    movhps     m1, [base+subpel_filters+r9*8]
%else
    movd       r0, m5
    movd       rX, m2
    psrldq     m5, 4
    psrldq     m2, 4
    movd       r4, m5
    movd       r5, m2
    movq       m0, [base+subpel_filters+r0*8]
    movq       m1, [base+subpel_filters+rX*8]
    movhps     m0, [base+subpel_filters+r4*8]
    movhps     m1, [base+subpel_filters+r5*8]
    pxor       m2, m2
 %define m9 m2
%endif
    paddd      m14, m7 ; mx+dx*[4-7]
    pand       m5, m14, m10
    psrld      m5, 6
    paddd      m15, m5
    pcmpeqd    m5, m9
    mova       [rsp+0x110], m14
    psrldq     m4, m15, 8
%if ARCH_X86_64
    movd       r10d, m15
    movd       r11d, m4
    psrldq     m15, 4
    psrldq     m4, 4
    movd       r13d, m15
    movd       rXd, m4
    movq       m2, [base+subpel_filters+r10*8]
    movq       m3, [base+subpel_filters+r11*8]
    movhps     m2, [base+subpel_filters+r13*8]
    movhps     m3, [base+subpel_filters+ rX*8]
    psrld      m14, 10
    psrldq     m4, m14, 8
    movd       r10d, m14
    movd       r11d, m4
    psrldq     m14, 4
    psrldq     m4, 4
    movd       r13d, m14
    movd       rXd, m4
    mov        r4d, [rsp+ 0]
    mov        r6d, [rsp+ 8]
    mov        r7d, [rsp+ 4]
    mov        r9d, [rsp+12]
    pshufd     m4, m6, q1100
    pshufd     m6, m6, q3322
    pshufd     m14, m5, q1100
    pshufd     m5, m5, q3322
    pand       m7, m11, m4
    pand       m8, m11, m6
    pand       m15, m11, m14
    pand       m11, m11, m5
    pandn      m4, m0
    pandn      m6, m1
    pandn      m14, m2
    pandn      m5, m3
    por        m7, m4
    por        m8, m6
    por        m15, m14
    por        m11, m5
    mova       [rsp+0x10], m7
    mova       [rsp+0x20], m8
    mova       [rsp+0x30], m15
    mova       [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
    mova       [rsp+0x50], m1
    mova       [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
    mova       [rsp+0x70], m3
    mova       [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
    SWAP       m7, m0
    SWAP       m8, m14
    mova       m1, [rsp+0x50]
    mova       m2, [rsp+0x60]
    mova       m3, [rsp+0x70]
    mova       m9, [rsp+0x80]
    mov        myd, mym
    mov        dyd, dym
    punpcklwd  m4, m5, m6 ; 45a
    punpckhwd  m5, m6     ; 45b
    punpcklwd  m6, m7, m8 ; 67a
    punpckhwd  m7, m8     ; 67b
    punpcklwd  m0, m1, m2 ; 01a
    punpckhwd  m1, m2     ; 01b
    punpcklwd  m2, m3, m9 ; 23a
    punpckhwd  m3, m9     ; 23b
    mova       [rsp+0x50], m4
    mova       [rsp+0x60], m5
    mova       [rsp+0x70], m6
    mova       [rsp+0x80], m7
    SWAP       m14, m8
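
; .vloop: vertical 8-tap over eight in-flight rows held as interleaved word
; pairs (01/23/45/67, a = pixels 0-3, b = pixels 4-7). After each output row
; the window advances by zero, one or two source rows, via the unpckw-based
; rotation or .skip_line below.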
.vloop:
    and        myd, 0x3ff
    mov        r6d, 64 << 24
    mov        r4d, myd
    shr        r4d, 6
    lea        r4d, [t1+r4]
    cmovnz     r6q, [base+subpel_filters+r4*8]
    movq       m11, r6q
    punpcklbw  m11, m11
    psraw      m11, 8
    pshufd     m5, m11, q0000
    pshufd     m7, m11, q1111
    pshufd     m10, m11, q2222
    pshufd     m11, m11, q3333
    pmaddwd    m4, m5, m0
    pmaddwd    m5, m5, m1
    pmaddwd    m6, m7, m2
    pmaddwd    m7, m7, m3
    paddd      m4, m13
    paddd      m5, m13
    paddd      m4, m6
    paddd      m5, m7
    pmaddwd    m6, [rsp+0x50], m10
    pmaddwd    m7, [rsp+0x60], m10
    pmaddwd    m8, [rsp+0x70], m11
    pmaddwd    m9, [rsp+0x80], m11
    paddd      m4, m6
    paddd      m5, m7
    paddd      m4, m8
    paddd      m5, m9
%else
    movd       r0, m15
    movd       rX, m4
    psrldq     m15, 4
    psrldq     m4, 4
    movd       r4, m15
    movd       r5, m4
    mova       m14, [esp+0x110]
    movq       m2, [base+subpel_filters+r0*8]
    movq       m3, [base+subpel_filters+rX*8]
    movhps     m2, [base+subpel_filters+r4*8]
    movhps     m3, [base+subpel_filters+r5*8]
    psrld      m14, 10
    mova       [esp+16], m14
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
    mova       [esp+0x20], m0
    mova       [esp+0x30], m1
    mova       [esp+0x40], m2
    mova       [esp+0x50], m3
    pshufd     m4, m6, q1100
    pshufd     m6, m6, q3322
    pshufd     m7, m5, q1100
    pshufd     m5, m5, q3322
    pand       m0, m11, m4
    pand       m1, m11, m6
    pand       m2, m11, m7
    pand       m3, m11, m5
    pandn      m4, [esp+0x20]
    pandn      m6, [esp+0x30]
    pandn      m7, [esp+0x40]
    pandn      m5, [esp+0x50]
    por        m0, m4
    por        m1, m6
    por        m2, m7
    por        m3, m5
    mova       [esp+0x20], m0
    mova       [esp+0x30], m1
    mova       [esp+0x40], m2
    mova       [esp+0x50], m3
    MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
    MC_8TAP_SCALED_H 0x20, 0x160    ; 2-3
    MC_8TAP_SCALED_H 0x20, 0x180    ; 4-5
    MC_8TAP_SCALED_H 0x20, 0x1a0    ; 6-7
    mova       m5, [esp+0x180]
    mova       m6, [esp+0x190]
    mova       m7, [esp+0x1a0]
    mova       m0, [esp+0x1b0]
    mov        myd, mym
    punpcklwd  m4, m5, m6 ; 45a
    punpckhwd  m5, m6     ; 45b
    punpcklwd  m6, m7, m0 ; 67a
    punpckhwd  m7, m0     ; 67b
    mova       [esp+0x180], m4
    mova       [esp+0x190], m5
    mova       [esp+0x1a0], m6
    mova       [esp+0x1b0], m7
    mova       m1, [esp+0x140]
    mova       m2, [esp+0x150]
    mova       m3, [esp+0x160]
    mova       m4, [esp+0x170]
    punpcklwd  m0, m1, m2 ; 01a
    punpckhwd  m1, m2     ; 01b
    punpcklwd  m2, m3, m4 ; 23a
    punpckhwd  m3, m4     ; 23b
    mova       [esp+0x140], m0
    mova       [esp+0x150], m1
    mova       [esp+0x160], m2
    mova       [esp+0x170], m3
.vloop:
    mov        r0, r0m
    mov        r5, [esp+0x1f4]
    and        myd, 0x3ff
    mov        mym, myd
    xor        r3, r3
    shr        r4, 6
    lea        r5, [r5+r4]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r5*8+0]
    cmovnz     r3, [base+subpel_filters+r5*8+4]
    movd       m7, r4
    movd       m6, r3
    punpckldq  m7, m6
    punpcklbw  m7, m7
    psraw      m7, 8
    pshufd     m4, m7, q0000
    pshufd     m5, m7, q1111
    pmaddwd    m0, m4
    pmaddwd    m1, m4
    pmaddwd    m2, m5
    pmaddwd    m3, m5
    pshufd     m6, m7, q2222
    pshufd     m7, m7, q3333
    paddd      m0, m2
    paddd      m1, m3
    pmaddwd    m2, [esp+0x180], m6
    pmaddwd    m3, [esp+0x190], m6
    pmaddwd    m4, [esp+0x1a0], m7
    pmaddwd    m5, [esp+0x1b0], m7
    paddd      m0, m2
    paddd      m1, m3
    paddd      m0, m13
    paddd      m1, m13
    paddd      m4, m0
    paddd      m5, m1
%endif
    psrad      m4, rndshift
    psrad      m5, rndshift
    packssdw   m4, m5
%ifidn %1, put
    packuswb   m4, m4
    movq       [dstq], m4
    add        dstq, dsm
%else
    mova       [tmpq], m4
    add        tmpq, tmp_stridem
%endif
    dec        hd
    jz         .hloop_prep
%if ARCH_X86_64
    add        myd, dyd
    test       myd, ~0x3ff
    jz         .vloop
    test       myd, 0x400
    mov        [rsp+0x140], myd
    mov        r4d, [rsp+ 0]
    mov        r6d, [rsp+ 8]
    mov        r7d, [rsp+ 4]
    mov        r9d, [rsp+12]
    jz         .skip_line
    mova       m14, [base+unpckw]
    movq       m6, [srcq+r10]
    movq       m7, [srcq+r11]
    movhps     m6, [srcq+r13]
    movhps     m7, [srcq+ rX]
    movq       m4, [srcq+ r4]
    movq       m5, [srcq+ r6]
    movhps     m4, [srcq+ r7]
    movhps     m5, [srcq+ r9]
    add        srcq, ssq
    mov        myd, [rsp+0x140]
    mov        dyd, dym
    pshufd     m9, m14, q1032
    pshufb     m0, m14 ; 0a 1a
    pshufb     m1, m14 ; 0b 1b
    pshufb     m2, m9  ; 3a 2a
    pshufb     m3, m9  ; 3b 2b
    pmaddubsw  m6, [rsp+0x30]
    pmaddubsw  m7, [rsp+0x40]
    pmaddubsw  m4, [rsp+0x10]
    pmaddubsw  m5, [rsp+0x20]
    phaddw     m6, m7
    phaddw     m4, m5
    phaddw     m4, m6
    pmulhrsw   m4, m12
    pshufb     m5, [rsp+0x50], m14 ; 4a 5a
    pshufb     m6, [rsp+0x60], m14 ; 4b 5b
    pshufb     m7, [rsp+0x70], m9  ; 7a 6a
    pshufb     m8, [rsp+0x80], m9  ; 7b 6b
    punpckhwd  m0, m2 ; 12a
    punpckhwd  m1, m3 ; 12b
    punpcklwd  m2, m5 ; 34a
    punpcklwd  m3, m6 ; 34b
    punpckhwd  m5, m7 ; 56a
    punpckhwd  m6, m8 ; 56b
    punpcklwd  m7, m4 ; 78a
    punpckhqdq m4, m4
    punpcklwd  m8, m4 ; 78b
    mova       [rsp+0x50], m5
    mova       [rsp+0x60], m6
    mova       [rsp+0x70], m7
    mova       [rsp+0x80], m8
    jmp        .vloop
.skip_line:
    mova       m0, [rsp+0x10]
    mova       m1, [rsp+0x20]
    mova       m14, [rsp+0x30]
    mova       m15, [rsp+0x40]
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
    mov        myd, [rsp+0x140]
    mov        dyd, dym
    mova       m0, m2         ; 01a
    mova       m1, m3         ; 01b
    mova       m2, [rsp+0x50] ; 23a
    mova       m3, [rsp+0x60] ; 23b
    mova       m5, [rsp+0x70] ; 45a
    mova       m6, [rsp+0x80] ; 45b
    punpcklwd  m7, m4, m8 ; 67a
    punpckhwd  m4, m8     ; 67b
    mova       [rsp+0x50], m5
    mova       [rsp+0x60], m6
    mova       [rsp+0x70], m7
    mova       [rsp+0x80], m4
%else
    mov        r0m, r0
    mov        myd, mym
    mov        r3, r3m
    add        myd, dym
    test       myd, ~0x3ff
    mov        mym, myd
    jnz        .next_line
    mova       m0, [esp+0x140]
    mova       m1, [esp+0x150]
    mova       m2, [esp+0x160]
    mova       m3, [esp+0x170]
    jmp        .vloop
.next_line:
    test       myd, 0x400
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
    jz         .skip_line
    mova       m6, [base+unpckw]
    mova       m0, [esp+0x140]
    mova       m1, [esp+0x150]
    mova       m7, [esp+0x180]
    movq       m4, [srcq+r0]
    movq       m5, [srcq+rX]
    movhps     m4, [srcq+r4]
    movhps     m5, [srcq+r5]
    pshufb     m0, m6 ; 0a 1a
    pshufb     m1, m6 ; 0b 1b
    pshufb     m7, m6 ; 4a 5a
    mov        r0, [esp+16]
    mov        rX, [esp+24]
    mov        r4, [esp+20]
    mov        r5, [esp+28]
    movq       m3, [srcq+r0]
    movq       m2, [srcq+rX]
    movhps     m3, [srcq+r4]
    movhps     m2, [srcq+r5]
    add        srcq, ssq
    pmaddubsw  m4, [esp+0x20]
    pmaddubsw  m5, [esp+0x30]
    pmaddubsw  m3, [esp+0x40]
    pmaddubsw  m2, [esp+0x50]
    phaddw     m4, m5
    phaddw     m3, m2
    mova       m5, [esp+0x190]
    mova       m2, [esp+0x160]
    phaddw     m4, m3
    mova       m3, [esp+0x170]
    pmulhrsw   m4, m12 ; 8a 8b
    mov        myd, mym
    pshufb     m5, m6 ; 4b 5b
    pshufd     m6, m6, q1032
    pshufb     m2, m6 ; 3a 2a
    pshufb     m3, m6 ; 3b 2b
    punpckhwd  m0, m2 ; 12a
    punpckhwd  m1, m3 ; 12b
    mova       [esp+0x140], m0
    mova       [esp+0x150], m1
    mova       m0, [esp+0x1a0]
    mova       m1, [esp+0x1b0]
    punpcklwd  m2, m7 ; 34a
    punpcklwd  m3, m5 ; 34b
    mova       [esp+0x160], m2
    mova       [esp+0x170], m3
    pshufb     m0, m6 ; 7a 6a
    pshufb     m1, m6 ; 7b 6b
    punpckhwd  m7, m0 ; 56a
    punpckhwd  m5, m1 ; 56b
    punpcklwd  m0, m4
    punpckhqdq m4, m4
    punpcklwd  m1, m4
    mova       [esp+0x180], m7
    mova       [esp+0x190], m5
    mova       [esp+0x1a0], m0
    mova       [esp+0x1b0], m1
    mova       m0, [esp+0x140]
    mova       m1, [esp+0x150]
    jmp        .vloop
.skip_line:
    MC_8TAP_SCALED_H 0x20, 0x1c0, 0
    mov        myd, mym
    mova       m0, [esp+0x160]
    mova       m1, [esp+0x170]
    mova       m2, [esp+0x180]
    mova       m3, [esp+0x190]
    mova       [esp+0x140], m0
    mova       [esp+0x150], m1
    mova       m4, [esp+0x1a0]
    mova       m5, [esp+0x1b0]
    mova       [esp+0x160], m2
    mova       [esp+0x170], m3
    mova       m6, [esp+0x1c0]
    mova       m7, [esp+0x1d0]
    mova       [esp+0x180], m4
    mova       [esp+0x190], m5
    punpcklwd  m4, m6, m7
    punpckhwd  m6, m7
    mova       [esp+0x1a0], m4
    mova       [esp+0x1b0], m6
%endif
    jmp        .vloop
INIT_XMM ssse3
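
; .dy1: dy == 1024, i.e. a vertical step of exactly one source row, so a
; single vertical filter applies to the whole block and is broadcast once
; instead of being reloaded from my on every row.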
.dy1:
    movzx      wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add        wq, base_reg
    jmp        wq
%ifidn %1, put
.dy1_w2:
 %if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx      r5, byte [esp+0x1f0]
    dec        srcd
    movd       m15, r5
 %endif
    punpckldq  m9, m8
    SWAP       m8, m9
    paddd      m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd     m15, m15, q0000
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    movd       r4d, m15
    psrldq     m15, 4
 %if ARCH_X86_64
    movd       r6d, m15
 %else
    movd       r3d, m15
 %endif
    mova       m5, [base+bdct_lb_dw]
    mova       m6, [base+subpel_s_shuf2]
    movd       m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd       m7, [base+subpel_filters+r6*8+2]
 %else
    movd       m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor       m9, m9
    pcmpeqd    m8, m9
    psrld      m14, 10
 %if ARCH_X86_32
    mov        r3, r3m
    pshufb     m14, m5
    paddb      m14, m6
    mova       [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP       m5, m0
    SWAP       m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq       m0, [srcq+ssq*0]
    movq       m2, [srcq+ssq*2]
    movhps     m0, [srcq+ssq*1]
    movhps     m2, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr        myd, 6
    mov        r4d, 64 << 24
    lea        myd, [t1+myq]
    cmovnz     r4q, [base+subpel_filters+myq*8]
    pshufb     m14, m5
    paddb      m14, m6
    movq       m10, r4
 %else
    mov        myd, mym
    mov        r5, [esp+0x1f4]
    xor        r3, r3
    shr        myd, 6
    lea        r5, [r5+myd]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r5*8+0]
    cmovnz     r3, [base+subpel_filters+r5*8+4]
  %define m10 m4
    movd       m10, r4
    movd       m3, r3
    mov        r3, r3m
    punpckldq  m10, m3
 %endif
    movq       m1, [srcq+ssq*0]
    movq       m3, [srcq+ssq*2]
    movhps     m1, [srcq+ssq*1]
    add        srcq, ss3q
    punpcklbw  m10, m10
    psraw      m10, 8
    punpckldq  m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand       m11, m8
 %else
    pand       m7, m11, m8
  %define m11 m7
 %endif
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m11
 %if ARCH_X86_64
    pshufd     m8, m10, q0000
    pshufd     m9, m10, q1111
    pshufd     m11, m10, q3333
    pshufd     m10, m10, q2222
 %else
    mova       [esp+0x10], m15
  %define m15 [esp+0x10]
    mov        r0, r0m
    pshufd     m5, m4, q0000
    pshufd     m6, m4, q1111
    pshufd     m7, m4, q2222
    pshufd     m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova       m8, m5
    mova       m9, m6
    mova       m10, m7
    mova       m11, m4
 %endif
    pshufb     m0, m14
    pshufb     m2, m14
    pshufb     m1, m14
    pshufb     m3, m14
    pmaddubsw  m0, m15
    pmaddubsw  m2, m15
    pmaddubsw  m1, m15
    pmaddubsw  m3, m15
    phaddw     m0, m2
    phaddw     m1, m3
    pmulhrsw   m0, m12
    pmulhrsw   m1, m12
    palignr    m2, m1, m0, 4
    pshufd     m4, m1, q2121
    punpcklwd  m3, m0, m2 ; 01 12
    punpckhwd  m0, m2     ; 23 34
    punpcklwd  m2, m1, m4 ; 45 56
.dy1_w2_loop:
    movq       m1, [srcq+ssq*0]
    movhps     m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pmaddwd    m5, m3, m8
    pmaddwd    m6, m0, m9
    pmaddwd    m7, m2, m10
    mova       m3, m0
    mova       m0, m2
    paddd      m5, m13
    paddd      m6, m7
    pshufb     m1, m14
    pmaddubsw  m1, m15
    phaddw     m1, m1
    pmulhrsw   m1, m12
    palignr    m7, m1, m4, 12
    punpcklwd  m2, m7, m1 ; 67 78
    pmaddwd    m7, m2, m11
    mova       m4, m1
    paddd      m5, m6
    paddd      m5, m7
    psrad      m5, rndshift
    packssdw   m5, m5
    packuswb   m5, m5
    movd       r4d, m5
    mov        [dstq+dsq*0], r4w
    shr        r4d, 16
    mov        [dstq+dsq*1], r4w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
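
; .dy1_w4: like .w4 but with the fixed vertical filter; each loop iteration
; filters two new source rows and emits two output rows.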
.dy1_w4:
%if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
%else
 %define m10  [base+pd_0x3ff]
 %define m11  [base+pd_0x4000]
 %define m8   m0
 %xdefine m14 m4
 %define m15  m3
 %if isprep
  %define ssq r3
 %endif
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
%endif
    pmaddwd    m8, [base+rescale_mul]
%if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
%endif
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    psrldq     m7, m15, 8
%if ARCH_X86_64
    movd       r4d, m15
    movd       r11d, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r6d, m15
    movd       r13d, m7
    movd       m15, [base+subpel_filters+ r4*8+2]
    movd       m2, [base+subpel_filters+r11*8+2]
    movd       m3, [base+subpel_filters+ r6*8+2]
    movd       m4, [base+subpel_filters+r13*8+2]
    shr        myd, 6
    mov        r4d, 64 << 24
    lea        myd, [t1+myq]
    cmovnz     r4q, [base+subpel_filters+myq*8]
%else
    movd       r1, m15
    movd       r3, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r4, m15
    movd       r5, m7
 %define m15 m5
    SWAP       m4, m7
    movd       m15, [base+subpel_filters+r1*8+2]
    movd       m2, [base+subpel_filters+r3*8+2]
    movd       m3, [base+subpel_filters+r4*8+2]
    movd       m4, [base+subpel_filters+r5*8+2]
    mov        myd, mym
    mov        rX, [esp+0x1f4]
    xor        r5, r5
    shr        myd, 6
    lea        rX, [rX+myd]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+rX*8+0]
    cmovnz     r5, [base+subpel_filters+rX*8+4]
    mov        r3, r3m
 %if isprep
    lea        ss3q, [ssq*3]
 %endif
%endif
    punpckldq  m15, m3
    punpckldq  m2, m4
    punpcklqdq m15, m2
    movq       m6, [base+subpel_s_shuf2]
%if ARCH_X86_64
    pcmpeqd    m8, m9
    psrld      m14, 10
    pshufb     m14, [base+bdct_lb_dw]
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    movu       m2, [srcq+ssq*2]
    movu       m3, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    punpcklqdq m6, m6
    movu       m4, [srcq+ssq*0]
    movu       m5, [srcq+ssq*1]
    movu       m7, [srcq+ssq*2]
    add        srcq, ss3q
    pand       m11, m8
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m11
    paddb      m14, m6
    movq       m10, r4q
    punpcklbw  m10, m10
    psraw      m10, 8
    pshufb     m0, m14
    pshufb     m1, m14
    pshufb     m2, m14
    pshufb     m3, m14
    pshufb     m4, m14
    pshufb     m5, m14
    pshufb     m7, m14
    pmaddubsw  m0, m15
    pmaddubsw  m1, m15
    pmaddubsw  m2, m15
    pmaddubsw  m3, m15
    pmaddubsw  m4, m15
    pmaddubsw  m5, m15
    pmaddubsw  m7, m15
    phaddw     m0, m1
    phaddw     m2, m3
    phaddw     m4, m5
    phaddw     m6, m7, m7
    pmulhrsw   m0, m12 ; 0 1
    pmulhrsw   m2, m12 ; 2 3
    pmulhrsw   m4, m12 ; 4 5
    pmulhrsw   m6, m12 ; 6 _
    shufps     m1, m0, m2, q1032 ; 1 2
    shufps     m3, m2, m4, q1032 ; 3 4
    shufps     m5, m4, m6, q1032 ; 5 6
    punpcklwd  m7, m0, m1 ; 01
    punpckhwd  m0, m1     ; 12
    punpcklwd  m8, m2, m3 ; 23
    punpckhwd  m2, m3     ; 34
    punpcklwd  m9, m4, m5 ; 45
    punpckhwd  m4, m5     ; 56
%else
    pxor       m3, m3
    pcmpeqd    m8, m3
    psrld      m14, 10
    pshufb     m14, [base+bdct_lb_dw]
    movu       m1, [srcq+ssq*0]
    movu       m2, [srcq+ssq*1]
    movu       m3, [srcq+ssq*2]
    add        srcq, ss3q
    punpcklqdq m6, m6
    SWAP       m4, m7
    pand       m7, m11, m8
    pandn      m8, m15
    SWAP       m5, m0
    por        m15, m7
    paddb      m14, m6
    movu       m0, [srcq+ssq*0]
    movu       m7, [srcq+ssq*1]
    movu       m6, [srcq+ssq*2]
    pshufb     m1, m14
    pshufb     m2, m14
    pshufb     m3, m14
    pshufb     m0, m14
    pshufb     m7, m14
    pshufb     m6, m14
    pmaddubsw  m1, m15
    pmaddubsw  m2, m15
    pmaddubsw  m3, m15
    mova       [esp+0x00], m14
    mova       [esp+0x10], m15
    pmaddubsw  m0, m15
    pmaddubsw  m7, m15
    pmaddubsw  m6, m15
    phaddw     m1, m2
    movu       m2, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    mov        r0, r0m
    phaddw     m3, m0
    pshufb     m2, m14
    pmaddubsw  m2, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw     m7, m6
    phaddw     m2, m2
    movd       m6, r4
    movd       m0, r5
    punpckldq  m6, m0
    punpcklbw  m6, m6
    psraw      m6, 8
    mova       [esp+0x20], m6
    pmulhrsw   m1, m12 ; 0 1
    pmulhrsw   m3, m12 ; 2 3
    pmulhrsw   m7, m12 ; 4 5
    pmulhrsw   m2, m12 ; 6 _
    shufps     m0, m1, m3, q1032 ; 1 2
    shufps     m4, m3, m7, q1032 ; 3 4
    shufps     m5, m7, m2, q1032 ; 5 6
    punpcklwd  m6, m1, m0 ; 01
    punpckhwd  m1, m0     ; 12
    mova       [esp+0x30], m1
    punpcklwd  m1, m3, m4 ; 23
    punpckhwd  m3, m4     ; 34
    mova       [esp+0x40], m3
    punpcklwd  m3, m7, m5 ; 45
    punpckhwd  m7, m5     ; 56
    mova       [esp+0x50], m7
    mova       [esp+0x60], m2
    mova       m0, [esp+0x20]
 %xdefine m8  m1
 %xdefine m9  m3
 %xdefine m10 m0
    SWAP       m7, m6
    SWAP       m1, m4
    SWAP       m3, m2
%endif
    pshufd     m1, m10, q0000
    pshufd     m3, m10, q1111
    pshufd     m5, m10, q2222
    pshufd     m10, m10, q3333
%if ARCH_X86_64
    mova       [rsp+0x00], m8
    mova       [rsp+0x10], m2
    mova       [rsp+0x20], m9
    mova       [rsp+0x30], m4
%else
    mova       [esp+0x70], m8
    mova       [esp+0x80], m9
    mova       [esp+0x90], m1
    mova       [esp+0xa0], m3
    mova       [esp+0xb0], m5
    mova       [esp+0xc0], m10
 %ifidn %1, put
    mov        dsd, dsm
 %endif
 %define m11 m6
%endif
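
; Per iteration, only the new 67/78 word pairs are computed from the two
; freshly filtered rows; the older pairs rotate through registers/stack.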
.dy1_w4_loop:
%if ARCH_X86_64
    movu       m11, [srcq+ssq*0]
    pmaddwd    m7, m1
    pmaddwd    m8, m3
    pmaddwd    m0, m1
    pmaddwd    m2, m3
    pmaddwd    m9, m5
    pmaddwd    m4, m5
    paddd      m7, m8
    paddd      m0, m2
    movu       m8, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m11, m14
    pmaddubsw  m11, m15
    paddd      m7, m13
    paddd      m0, m13
    paddd      m7, m9
    paddd      m0, m4
    pshufb     m8, m14
    pmaddubsw  m8, m15
    phaddw     m11, m8
    mova       m8, [rsp+0x20]
    pmulhrsw   m11, m12
    punpcklwd  m9, m6, m11 ; 67
    psrldq     m6, m11, 8
    punpcklwd  m4, m11, m6 ; 78
    pmaddwd    m2, m9, m10
    pmaddwd    m11, m4, m10
    paddd      m7, m2
    mova       m2, [rsp+0x30]
    paddd      m0, m11
%else
    SWAP       m7, m6
    SWAP       m1, m4
    SWAP       m3, m2
    movu       m5, [srcq+ssq*0]
    mova       m0, [esp+0x30]
    mova       m2, [esp+0x40]
    mova       m4, [esp+0x50]
    pmaddwd    m6, [esp+0x90]
    pmaddwd    m1, [esp+0xa0]
    pmaddwd    m0, [esp+0x90]
    pmaddwd    m2, [esp+0xa0]
    pmaddwd    m3, [esp+0xb0]
    pmaddwd    m4, [esp+0xb0]
    paddd      m6, m1
    paddd      m0, m2
    movu       m7, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m5, m14
    pmaddubsw  m5, m15
    paddd      m6, m13
    paddd      m0, m13
    paddd      m6, m3
    paddd      m0, m4
    pshufb     m7, m14
    pmaddubsw  m7, m15
    phaddw     m5, m7
    mova       m7, [rsp+0x80]
    pmulhrsw   m5, m12
    punpcklwd  m3, [esp+0x60], m5 ; 67
    psrldq     m1, m5, 8
    punpcklwd  m4, m5, m1 ; 78
    pmaddwd    m2, m3, [esp+0xc0]
    pmaddwd    m5, m4, [esp+0xc0]
    mova       [esp+0x60], m1
    paddd      m6, m2
    mova       m2, [esp+0x50]
    paddd      m0, m5
    SWAP       m7, m6
%endif
    psrad      m7, rndshift
    psrad      m0, rndshift
    packssdw   m7, m0
%if ARCH_X86_64
    mova       m0, [rsp+0x10]
%else
    mova       m0, [esp+0x40]
 %define m11 m5
%endif
%ifidn %1, put
    packuswb   m7, m7
    psrldq     m11, m7, 4
    movd       [dstq+dsq*0], m7
    movd       [dstq+dsq*1], m11
    lea        dstq, [dstq+dsq*2]
%else
    mova       [tmpq], m7
    add        tmpq, 16
%endif
    sub        hd, 2
    jz         .ret
%if ARCH_X86_64
    mova       m7, [rsp+0x00]
    mova       [rsp+0x00], m8
    mova       [rsp+0x10], m2
    mova       [rsp+0x20], m9
    mova       [rsp+0x30], m4
%else
    mova       m7, [esp+0x70] ; 01
    mova       m1, [esp+0x80] ; 23
    mova       m2, [esp+0x50] ; 34
    mova       [esp+0x30], m0
    mova       [esp+0x70], m1
    mova       [esp+0x40], m2
    mova       [esp+0x80], m3
    mova       [esp+0x50], m4
%endif
    jmp        .dy1_w4_loop
INIT_XMM ssse3
.dy1_w8:
    mov        dword [rsp+0x90], 1
    movifprep  tmp_stridem, 16
    jmp        .dy1_w_start
.dy1_w16:
    mov        dword [rsp+0x90], 2
    movifprep  tmp_stridem, 32
    jmp        .dy1_w_start
.dy1_w32:
    mov        dword [rsp+0x90], 4
    movifprep  tmp_stridem, 64
    jmp        .dy1_w_start
.dy1_w64:
    mov        dword [rsp+0x90], 8
    movifprep  tmp_stridem, 128
    jmp        .dy1_w_start
.dy1_w128:
    mov        dword [rsp+0x90], 16
    movifprep  tmp_stridem, 256
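
; .dy1_w_start: strip loop for widths >= 8, using the same [rsp+0x90] /
; hloop structure as .w_start, with the broadcast vertical coefficients
; kept in [rsp+0x140..0x170].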
.dy1_w_start:
    mov        myd, mym
%ifidn %1, put
    movifnidn  dsm, dsq
%endif
%if ARCH_X86_64
    shr        t0d, 16
    sub        srcq, 3
    shr        myd, 6
    mov        r4d, 64 << 24
    lea        myd, [t1+myq]
    cmovnz     r4q, [base+subpel_filters+myq*8]
    movd       m15, t0d
%else
 %define m8   m0
 %define m9   m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov        r5, [esp+0x1f0]
    mov        r3, [esp+0x1f4]
    shr        r5, 16
    sub        srcq, 3
    movd       m15, r5
    xor        r5, r5
    shr        myd, 6
    lea        r3, [r3+myd]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r3*8+0]
    cmovnz     r5, [base+subpel_filters+r3*8+4]
    mov        r0, r0m
    mov        r3, r3m
%endif
    pslld      m7, m8, 2 ; dx*4
    pmaddwd    m8, [base+rescale_mul] ; dx*[0-3]
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq       m3, r4q
    punpcklbw  m3, m3
    psraw      m3, 8
%else
    movd       m5, r4
    movd       m6, r5
    punpckldq  m5, m6
    punpcklbw  m5, m5
    psraw      m5, 8
    SWAP       m3, m5
%endif
    mova       [rsp+0x100], m7
    mova       [rsp+0x120], m15
    mov        [rsp+0x098], srcq
    mov        [rsp+0x130], r0q ; dstq / tmpq
    pshufd     m0, m3, q0000
    pshufd     m1, m3, q1111
    pshufd     m2, m3, q2222
    pshufd     m3, m3, q3333
    mova       [rsp+0x140], m0
    mova       [rsp+0x150], m1
    mova       [rsp+0x160], m2
    mova       [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov        hm, hd
%elif ARCH_X86_32
    SWAP       m5, m3
    mov        r5, hm
    mov        [esp+0x134], r5
%endif
    jmp        .dy1_hloop
.dy1_hloop_prep:
    dec        dword [rsp+0x090]
    jz         .ret
%if ARCH_X86_64
    add        qword [rsp+0x130], 8*(isprep+1)
    mov        hd, hm
%else
    add        dword [rsp+0x130], 8*(isprep+1)
    mov        r5, [esp+0x134]
    mov        r0, [esp+0x130]
%endif
    mova       m7, [rsp+0x100]
    mova       m14, [rsp+0x110]
%if ARCH_X86_64
    mova       m10, [base+pd_0x3ff]
%else
 %define m10 [base+pd_0x3ff]
%endif
    mova       m15, [rsp+0x120]
    mov        srcq, [rsp+0x098]
%if ARCH_X86_64
    mov        r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov        hm, r5
    mov        r0m, r0
    mov        r3, r3m
%endif
    paddd      m14, m7
.dy1_hloop:
    pxor       m9, m9
%if ARCH_X86_64
    mova       m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld      m2, m14, 10
    mova       [rsp], m2
    pand       m6, m14, m10
    psrld      m6, 6
    paddd      m5, m15, m6
    pcmpeqd    m6, m9
    psrldq     m2, m5, 8
%if ARCH_X86_64
    movd       r4d, m5
    movd       r6d, m2
    psrldq     m5, 4
    psrldq     m2, 4
    movd       r7d, m5
    movd       r9d, m2
    movq       m0, [base+subpel_filters+r4*8]
    movq       m1, [base+subpel_filters+r6*8]
    movhps     m0, [base+subpel_filters+r7*8]
    movhps     m1, [base+subpel_filters+r9*8]
%else
    movd       r0, m5
    movd       rX, m2
    psrldq     m5, 4
    psrldq     m2, 4
    movd       r4, m5
    movd       r5, m2
    movq       m0, [base+subpel_filters+r0*8]
    movq       m1, [base+subpel_filters+rX*8]
    movhps     m0, [base+subpel_filters+r4*8]
    movhps     m1, [base+subpel_filters+r5*8]
    pxor       m2, m2
 %define m9 m2
%endif
    paddd      m14, m7 ; mx+dx*[4-7]
    pand       m5, m14, m10
    psrld      m5, 6
    paddd      m15, m5
    pcmpeqd    m5, m9
    mova       [rsp+0x110], m14
    psrldq     m4, m15, 8
%if ARCH_X86_64
    movd       r10d, m15
    movd       r11d, m4
    psrldq     m15, 4
    psrldq     m4, 4
    movd       r13d, m15
    movd       rXd, m4
    movq       m2, [base+subpel_filters+r10*8]
    movq       m3, [base+subpel_filters+r11*8]
    movhps     m2, [base+subpel_filters+r13*8]
    movhps     m3, [base+subpel_filters+ rX*8]
    psrld      m14, 10
    psrldq     m4, m14, 8
    movd       r10d, m14
    movd       r11d, m4
    psrldq     m14, 4
    psrldq     m4, 4
    movd       r13d, m14
    movd       rXd, m4
    mov        r4d, [rsp+ 0]
    mov        r6d, [rsp+ 8]
    mov        r7d, [rsp+ 4]
    mov        r9d, [rsp+12]
    pshufd     m4, m6, q1100
    pshufd     m6, m6, q3322
    pshufd     m7, m5, q1100
    pshufd     m5, m5, q3322
    pand       m8, m11, m4
    pand       m9, m11, m6
    pand       m15, m11, m7
    pand       m11, m11, m5
    pandn      m4, m0
    pandn      m6, m1
    pandn      m7, m2
    pandn      m5, m3
    por        m8, m4
    por        m9, m6
    por        m15, m7
    por        m11, m5
    mova       [rsp+0x10], m8
    mova       [rsp+0x20], m9
    mova       [rsp+0x30], m15
    mova       [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova       [rsp+0x50], m1
    mova       [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova       [rsp+0x70], m3
    mova       [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP       m7, m0
    SWAP       m8, m14
    mova       m1, [rsp+0x50]
    mova       m2, [rsp+0x60]
    mova       m3, [rsp+0x70]
    mova       m15, [rsp+0x80]
    punpcklwd  m4, m5, m6 ; 45a
    punpckhwd  m5, m6     ; 45b
    punpcklwd  m6, m7, m8 ; 67a
    punpckhwd  m7, m8     ; 67b
    SWAP       m14, m8
    mova       m8, [rsp+0x140]
    mova       m9, [rsp+0x150]
    mova       m10, [rsp+0x160]
    mova       m11, [rsp+0x170]
    punpcklwd  m0, m1, m2 ; 01a
    punpckhwd  m1, m2     ; 01b
    punpcklwd  m2, m3, m15; 23a
    punpckhwd  m3, m15    ; 23b
    mova       [rsp+0x50], m4
    mova       [rsp+0x60], m5
    mova       [rsp+0x70], m6
    mova       [rsp+0x80], m7
    mova       m14, [base+unpckw]
%else
    movd       r0, m15
    movd       rX, m4
    psrldq     m15, 4
    psrldq     m4, 4
    movd       r4, m15
    movd       r5, m4
    mova       m14, [esp+0x110]
    movq       m2, [base+subpel_filters+r0*8]
    movq       m3, [base+subpel_filters+rX*8]
    movhps     m2, [base+subpel_filters+r4*8]
    movhps     m3, [base+subpel_filters+r5*8]
    psrld      m14, 10
    mova       [esp+16], m14
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
    mova       [esp+0x20], m0
    mova       [esp+0x30], m1
    mova       [esp+0x40], m2
    mova       [esp+0x50], m3
    pshufd     m4, m6, q1100
    pshufd     m6, m6, q3322
    pshufd     m7, m5, q1100
    pshufd     m5, m5, q3322
    pand       m0, m11, m4
    pand       m1, m11, m6
    pand       m2, m11, m7
    pand       m3, m11, m5
    pandn      m4, [esp+0x20]
    pandn      m6, [esp+0x30]
    pandn      m7, [esp+0x40]
    pandn      m5, [esp+0x50]
    por        m0, m4
    por        m1, m6
    por        m2, m7
    por        m3, m5
    mova       [esp+0x20], m0
    mova       [esp+0x30], m1
    mova       [esp+0x40], m2
    mova       [esp+0x50], m3
    MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H 0x20, 0x180   ; 2-3
    MC_8TAP_SCALED_H 0x20, 0x1a0   ; 4-5
    MC_8TAP_SCALED_H 0x20, 0x1c0   ; 6-7
    mova       m5, [esp+0x1a0]
    mova       m6, [esp+0x1b0]
    mova       m7, [esp+0x1c0]
    mova       m0, [esp+0x1d0]
    punpcklwd  m4, m5, m6 ; 45a
    punpckhwd  m5, m6     ; 45b
    punpcklwd  m6, m7, m0 ; 67a
    punpckhwd  m7, m0     ; 67b
    mova       [esp+0x1a0], m4
    mova       [esp+0x1b0], m5
    mova       [esp+0x1c0], m6
    mova       [esp+0x1d0], m7
    mova       m1, [esp+0x060]
    mova       m2, [esp+0x070]
    mova       m3, [esp+0x180]
    mova       m4, [esp+0x190]
    punpcklwd  m0, m1, m2 ; 01a
    punpckhwd  m1, m2     ; 01b
    punpcklwd  m2, m3, m4 ; 23a
    punpckhwd  m3, m4     ; 23b
    mova       [esp+0x060], m0
    mova       [esp+0x070], m1
    mova       [esp+0x180], m2
    mova       [esp+0x190], m3
 %define m8  [esp+0x140]
 %define m9  [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
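
; .dy1_vloop: one output row per iteration; exactly one new source row is
; then filtered and the 12/34/56/78 pair chain shifts down by one.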
.dy1_vloop:
%if ARCH_X86_32
    mov        r0, r0m
%endif
    pmaddwd    m4, m0, m8
    pmaddwd    m5, m1, m8
    pmaddwd    m6, m2, m9
    pmaddwd    m7, m3, m9
    paddd      m4, m13
    paddd      m5, m13
    paddd      m4, m6
    paddd      m5, m7
%if ARCH_X86_64
    pmaddwd    m6, [rsp+0x50], m10
    pmaddwd    m7, [rsp+0x60], m10
%else
    pmaddwd    m6, [rsp+0x1a0], m10
    pmaddwd    m7, [rsp+0x1b0], m10
%endif
    paddd      m4, m6
    paddd      m5, m7
%if ARCH_X86_64
    pmaddwd    m6, [rsp+0x70], m11
    pmaddwd    m7, [rsp+0x80], m11
%else
    pmaddwd    m6, [rsp+0x1c0], m11
    pmaddwd    m7, [rsp+0x1d0], m11
%endif
    paddd      m4, m6
    paddd      m5, m7
    psrad      m4, rndshift
    psrad      m5, rndshift
    packssdw   m4, m5
%ifidn %1, put
    packuswb   m4, m4
    movq       [dstq], m4
    add        dstq, dsm
%else
    mova       [tmpq], m4
    add        tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov        r0m, r0
%endif
    dec        hd
    jz         .dy1_hloop_prep
%if ARCH_X86_64
    movq       m4, [srcq+ r4]
    movq       m5, [srcq+ r6]
    movhps     m4, [srcq+ r7]
    movhps     m5, [srcq+ r9]
    movq       m6, [srcq+r10]
    movq       m7, [srcq+r11]
    movhps     m6, [srcq+r13]
    movhps     m7, [srcq+ rX]
    add        srcq, ssq
    pshufd     m15, m14, q1032
    pshufb     m0, m14 ; 0a 1a
    pshufb     m1, m14 ; 0b 1b
    pshufb     m2, m15 ; 3a 2a
    pshufb     m3, m15 ; 3b 2b
    pmaddubsw  m4, [rsp+0x10]
    pmaddubsw  m5, [rsp+0x20]
    pmaddubsw  m6, [rsp+0x30]
    pmaddubsw  m7, [rsp+0x40]
    phaddw     m4, m5
    phaddw     m6, m7
    phaddw     m4, m6
    pmulhrsw   m4, m12
    pshufb     m5, [rsp+0x70], m15 ; 7a 6a
    pshufb     m7, [rsp+0x80], m15 ; 7b 6b
    pshufb     m6, [rsp+0x50], m14 ; 4a 5a
    pshufb     m15, [rsp+0x60], m14 ; 4b 5b
    punpckhwd  m0, m2  ; 12a
    punpckhwd  m1, m3  ; 12b
    punpcklwd  m2, m6  ; 34a
    punpcklwd  m3, m15 ; 34b
    punpckhwd  m6, m5  ; 56a
    punpckhwd  m15, m7 ; 56b
    punpcklwd  m5, m4  ; 78a
    psrldq     m4, 8
    punpcklwd  m7, m4  ; 78b
    mova       [rsp+0x50], m6
    mova       [rsp+0x60], m15
    mova       [rsp+0x70], m5
    mova       [rsp+0x80], m7
%else
    mov        r0, [esp+ 0]
    mov        rX, [esp+ 8]
    mov        r4, [esp+ 4]
    mov        r5, [esp+12]
    mova       m6, [base+unpckw]
    mova       m0, [esp+0x060]
    mova       m1, [esp+0x070]
    mova       m7, [esp+0x1a0]
    movq       m4, [srcq+r0]
    movq       m5, [srcq+rX]
    movhps     m4, [srcq+r4]
    movhps     m5, [srcq+r5]
    pshufb     m0, m6 ; 0a 1a
    pshufb     m1, m6 ; 0b 1b
    pshufb     m7, m6 ; 4a 5a
    mov        r0, [esp+16]
    mov        rX, [esp+24]
    mov        r4, [esp+20]
    mov        r5, [esp+28]
    movq       m3, [srcq+r0]
    movq       m2, [srcq+rX]
    movhps     m3, [srcq+r4]
    movhps     m2, [srcq+r5]
    add        srcq, ssq
    pmaddubsw  m4, [esp+0x20]
    pmaddubsw  m5, [esp+0x30]
    pmaddubsw  m3, [esp+0x40]
    pmaddubsw  m2, [esp+0x50]
    phaddw     m4, m5
    phaddw     m3, m2
    mova       m5, [esp+0x1b0]
    mova       m2, [esp+0x180]
    phaddw     m4, m3
    mova       m3, [esp+0x190]
    pmulhrsw   m4, m12 ; 8a 8b
    pshufb     m5, m6 ; 4b 5b
    pshufd     m6, m6, q1032
    pshufb     m2, m6 ; 3a 2a
    pshufb     m3, m6 ; 3b 2b
    punpckhwd  m0, m2 ; 12a
    punpckhwd  m1, m3 ; 12b
    mova       [esp+0x60], m0
    mova       [esp+0x70], m1
    mova       m0, [esp+0x1c0]
    mova       m1, [esp+0x1d0]
    punpcklwd  m2, m7 ; 34a
    punpcklwd  m3, m5 ; 34b
    mova       [esp+0x180], m2
    mova       [esp+0x190], m3
    pshufb     m0, m6 ; 7a 6a
    pshufb     m1, m6 ; 7b 6b
    punpckhwd  m7, m0 ; 56a
    punpckhwd  m5, m1 ; 56b
    punpcklwd  m0, m4
    punpckhqdq m4, m4
    punpcklwd  m1, m4
    mova       [esp+0x1a0], m7
    mova       [esp+0x1b0], m5
    mova       [esp+0x1c0], m0
    mova       [esp+0x1d0], m1
    mova       m0, [esp+0x60]
    mova       m1, [esp+0x70]
%endif
    jmp        .dy1_vloop
INIT_XMM ssse3
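
; .dy2: dy == 2048, a vertical step of exactly two source rows, so every
; other input row is skipped and the vertical filter is again constant.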
.dy2:
    movzx      wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add        wq, base_reg
    jmp        wq
%ifidn %1, put
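; .dy2_w2 (put only): even source rows (0 2 4) and odd rows (1 3 5) are
; filtered into separate halves so the phaddw packing directly yields the
; 0 2 _ 4 / 1 3 _ 5 layout needed for the two-row vertical step.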
.dy2_w2:
 %if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
 %else
  %define m10 [base+pd_0x3ff]
  %define m11 [base+pd_0x4000]
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx      r5, byte [esp+0x1f0]
    dec        srcd
    movd       m15, r5
 %endif
    punpckldq  m9, m8
    SWAP       m8, m9
    paddd      m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
 %endif
    pshufd     m15, m15, q0000
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    movd       r4d, m15
    psrldq     m15, 4
 %if ARCH_X86_64
    movd       r6d, m15
 %else
    movd       r3d, m15
 %endif
    mova       m5, [base+bdct_lb_dw]
    mova       m6, [base+subpel_s_shuf2]
    movd       m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd       m7, [base+subpel_filters+r6*8+2]
 %else
    movd       m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor       m9, m9
    pcmpeqd    m8, m9
    psrld      m14, 10
 %if ARCH_X86_32
    mov        r3, r3m
    pshufb     m14, m5
    paddb      m14, m6
    mova       [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP       m5, m0
    SWAP       m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq       m0, [srcq+ssq*0]
    movq       m1, [srcq+ssq*1]
    movhps     m0, [srcq+ssq*2]
    movhps     m1, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr        myd, 6
    mov        r4d, 64 << 24
    lea        myd, [t1+myq]
    cmovnz     r4q, [base+subpel_filters+myq*8]
    pshufb     m14, m5
    paddb      m14, m6
    movq       m10, r4q
 %else
    mov        myd, mym
    mov        r3, [esp+0x1f4]
    xor        r5, r5
    shr        myd, 6
    lea        r3, [r3+myd]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r3*8+0]
    cmovnz     r5, [base+subpel_filters+r3*8+4]
    mov        r3, r3m
  %define m10 m4
    movd       m10, r4
    movd       m3, r5
    punpckldq  m10, m3
 %endif
    movq       m3, [srcq+ssq*0]
    movhps     m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpcklbw  m10, m10
    psraw      m10, 8
    punpckldq  m15, m7
    punpcklqdq m15, m15
 %if ARCH_X86_64
    pand       m11, m8
 %else
    pand       m7, m11, m8
  %define m11 m7
 %endif
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m11
 %if ARCH_X86_64
    pshufd     m8, m10, q0000
    pshufd     m9, m10, q1111
    pshufd     m11, m10, q3333
    pshufd     m10, m10, q2222
 %else
    mova       [esp+0x10], m15
  %define m15 [esp+0x10]
    mov        r5, r0m
  %define dstq r5
    mov        dsd, dsm
    pshufd     m5, m4, q0000
    pshufd     m6, m4, q1111
    pshufd     m7, m4, q2222
    pshufd     m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova       m8, m5
    mova       m9, m6
    mova       m10, m7
    mova       m11, m4
 %endif
    pshufb     m0, m14
    pshufb     m1, m14
    pshufb     m3, m14
    pmaddubsw  m0, m15
    pmaddubsw  m1, m15
    pmaddubsw  m3, m15
    pslldq     m2, m3, 8
    phaddw     m0, m2
    phaddw     m1, m3
    pmulhrsw   m0, m12 ; 0 2 _ 4
    pmulhrsw   m1, m12 ; 1 3 _ 5
    pshufd     m2, m0, q3110 ; 0 2 2 4
    pshufd     m1, m1, q3110 ; 1 3 3 5
    punpcklwd  m3, m2, m1 ; 01 23
    punpckhwd  m2, m1     ; 23 45
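
; .dy2_w2_loop: four new source rows per iteration produce two output rows.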
.dy2_w2_loop:
    movq       m6, [srcq+ssq*0]
    movq       m7, [srcq+ssq*1]
    movhps     m6, [srcq+ssq*2]
    movhps     m7, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    pmaddwd    m4, m3, m8
    pmaddwd    m5, m2, m9
    pshufb     m6, m14
    pshufb     m7, m14
    pmaddubsw  m6, m15
    pmaddubsw  m7, m15
    phaddw     m6, m7
    pmulhrsw   m6, m12
    psrldq     m7, m6, 8
    palignr    m6, m0, 8
    palignr    m7, m1, 8
    mova       m0, m6
    mova       m1, m7
    pshufd     m6, m6, q3221
    pshufd     m7, m7, q3221
    punpcklwd  m3, m6, m7 ; 45 67
    punpckhwd  m2, m6, m7 ; 67 89
    pmaddwd    m6, m3, m10
    pmaddwd    m7, m2, m11
    paddd      m4, m5
    paddd      m4, m13
    paddd      m6, m7
    paddd      m4, m6
    psrad      m4, rndshift
    packssdw   m4, m4
    packuswb   m4, m4
    movd       r4d, m4
    mov        [dstq+dsq*0], r4w
    shr        r4d, 16
    mov        [dstq+dsq*1], r4w
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
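
; .dy2_w4: 4-wide dy2 case; the horizontal setup mirrors .dy1_w4, but
; source rows are consumed two at a time.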
.dy2_w4:
%if ARCH_X86_64
    mov        myd, mym
    movzx      t0d, t0b
    dec        srcq
    movd       m15, t0d
%else
 %define m10  [base+pd_0x3ff]
 %define m11  [base+pd_0x4000]
 %define m8   m0
 %xdefine m14 m4
 %define m15  m3
 %define dstq r0
 %if isprep
  %define ssq r3
 %endif
    movzx      r4, byte [esp+0x1f0]
    dec        srcq
    movd       m15, r4
%endif
    pmaddwd    m8, [base+rescale_mul]
%if ARCH_X86_64
    mova       m11, [base+pd_0x4000]
%endif
    pshufd     m15, m15, q0000
    paddd      m14, m8 ; mx+dx*[0-3]
    pand       m8, m14, m10
    psrld      m8, 6
    paddd      m15, m8
    psrldq     m7, m15, 8
%if ARCH_X86_64
    movd       r4d, m15
    movd       r11d, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r6d, m15
    movd       r13d, m7
    movd       m15, [base+subpel_filters+ r4*8+2]
    movd       m2, [base+subpel_filters+r11*8+2]
    movd       m3, [base+subpel_filters+ r6*8+2]
    movd       m4, [base+subpel_filters+r13*8+2]
    movq       m6, [base+subpel_s_shuf2]
    shr        myd, 6
    mov        r4d, 64 << 24
    lea        myd, [t1+myq]
    cmovnz     r4q, [base+subpel_filters+myq*8]
%else
    movd       r1, m15
    movd       r3, m7
    psrldq     m15, 4
    psrldq     m7, 4
    movd       r4, m15
    movd       r5, m7
 %define m15 m5
    SWAP       m4, m7
    movd       m15, [base+subpel_filters+r1*8+2]
    movd       m2, [base+subpel_filters+r3*8+2]
    movd       m3, [base+subpel_filters+r4*8+2]
    movd       m4, [base+subpel_filters+r5*8+2]
    movq       m6, [base+subpel_s_shuf2]
    mov        myd, mym
    mov        r3, [esp+0x1f4]
    xor        r5, r5
    shr        myd, 6
    lea        r3, [r3+myd]
    mov        r4, 64 << 24
    cmovnz     r4, [base+subpel_filters+r3*8+0]
    cmovnz     r5, [base+subpel_filters+r3*8+4]
    mov        r3, r3m
 %if isprep
    lea        ss3q, [ssq*3]
 %endif
%endif
    punpckldq  m15, m3
    punpckldq  m2, m4
    punpcklqdq m15, m2
%if ARCH_X86_64
    pcmpeqd    m8, m9
    psrld      m14, 10
    movu       m0, [srcq+ssq*0]
    movu       m2, [srcq+ssq*2]
    movu       m1, [srcq+ssq*1]
    movu       m3, [srcq+ss3q ]
    lea        srcq, [srcq+ssq*4]
    punpcklqdq m6, m6
    pshufb     m14, [base+bdct_lb_dw]
    movu       m4, [srcq+ssq*0]
    movu       m5, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pand       m11, m8
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m11
    paddb      m14, m6
    movq       m11, r4q
    punpcklbw  m11, m11
    psraw      m11, 8
    pshufb     m0, m14
    pshufb     m2, m14
    pshufb     m1, m14
    pshufb     m3, m14
    pshufb     m4, m14
    pshufb     m5, m14
    pmaddubsw  m0, m15
    pmaddubsw  m2, m15
    pmaddubsw  m1, m15
    pmaddubsw  m3, m15
    pmaddubsw  m4, m15
    pmaddubsw  m5, m15
    phaddw     m0, m2
    phaddw     m1, m3
    phaddw     m4, m5
    pmulhrsw   m0, m12 ; 0 2
    pmulhrsw   m1, m12 ; 1 3
    pmulhrsw   m4, m12 ; 4 5
    pshufd     m8, m11, q0000
    pshufd     m9, m11, q1111
    pshufd     m10, m11, q2222
    pshufd     m11, m11, q3333
%else
    pxor       m3, m3
    pcmpeqd    m8, m3
    psrld      m14, 10
    pshufb     m14, [base+bdct_lb_dw]
    movu       m1, [srcq+ssq*0]
    movu       m2, [srcq+ssq*2]
    movu       m3, [srcq+ssq*1]
    add        srcq, ss3q
    punpcklqdq m6, m6
    SWAP       m4, m7
    pand       m7, m11, m8
    pandn      m8, m15
    SWAP       m15, m8
    por        m15, m7
    paddb      m14, m6
    movu       m0, [srcq+ssq*0]
    movu       m7, [srcq+ssq*1]
    movu       m6, [srcq+ssq*2]
    add        srcq, ss3q
    pshufb     m1, m14
    pshufb     m2, m14
    pshufb     m3, m14
    pshufb     m0, m14
    pshufb     m7, m14
    pshufb     m6, m14
    pmaddubsw  m1, m15
    pmaddubsw  m2, m15
    pmaddubsw  m3, m15
    mova       [esp+0x00], m14
    mova       [esp+0x10], m15
    pmaddubsw  m0, m15
    pmaddubsw  m7, m15
    pmaddubsw  m6, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw     m1, m2
    phaddw     m3, m0
    phaddw     m7, m6
[base+pd_0x4000] 7310%endif 7311 pshufd m15, m15, q0000 7312 paddd m14, m8 ; mx+dx*[0-3] 7313 pand m8, m14, m10 7314 psrld m8, 6 7315 paddd m15, m8 7316 psrldq m7, m15, 8 7317%if ARCH_X86_64 7318 movd r4d, m15 7319 movd r11d, m7 7320 psrldq m15, 4 7321 psrldq m7, 4 7322 movd r6d, m15 7323 movd r13d, m7 7324 movd m15, [base+subpel_filters+ r4*8+2] 7325 movd m2, [base+subpel_filters+r11*8+2] 7326 movd m3, [base+subpel_filters+ r6*8+2] 7327 movd m4, [base+subpel_filters+r13*8+2] 7328 movq m6, [base+subpel_s_shuf2] 7329 shr myd, 6 7330 mov r4d, 64 << 24 7331 lea myd, [t1+myq] 7332 cmovnz r4q, [base+subpel_filters+myq*8] 7333%else 7334 movd r1, m15 7335 movd r3, m7 7336 psrldq m15, 4 7337 psrldq m7, 4 7338 movd r4, m15 7339 movd r5, m7 7340 %define m15 m5 7341 SWAP m4, m7 7342 movd m15, [base+subpel_filters+r1*8+2] 7343 movd m2, [base+subpel_filters+r3*8+2] 7344 movd m3, [base+subpel_filters+r4*8+2] 7345 movd m4, [base+subpel_filters+r5*8+2] 7346 movq m6, [base+subpel_s_shuf2] 7347 mov myd, mym 7348 mov r3, [esp+0x1f4] 7349 xor r5, r5 7350 shr myd, 6 7351 lea r3, [r3+myd] 7352 mov r4, 64 << 24 7353 cmovnz r4, [base+subpel_filters+r3*8+0] 7354 cmovnz r5, [base+subpel_filters+r3*8+4] 7355 mov r3, r3m 7356 %if isprep 7357 lea ss3q, [ssq*3] 7358 %endif 7359%endif 7360 punpckldq m15, m3 7361 punpckldq m2, m4 7362 punpcklqdq m15, m2 7363%if ARCH_X86_64 7364 pcmpeqd m8, m9 7365 psrld m14, 10 7366 movu m0, [srcq+ssq*0] 7367 movu m2, [srcq+ssq*2] 7368 movu m1, [srcq+ssq*1] 7369 movu m3, [srcq+ss3q ] 7370 lea srcq, [srcq+ssq*4] 7371 punpcklqdq m6, m6 7372 pshufb m14, [base+bdct_lb_dw] 7373 movu m4, [srcq+ssq*0] 7374 movu m5, [srcq+ssq*1] 7375 lea srcq, [srcq+ssq*2] 7376 pand m11, m8 7377 pandn m8, m15 7378 SWAP m15, m8 7379 por m15, m11 7380 paddb m14, m6 7381 movq m11, r4q 7382 punpcklbw m11, m11 7383 psraw m11, 8 7384 pshufb m0, m14 7385 pshufb m2, m14 7386 pshufb m1, m14 7387 pshufb m3, m14 7388 pshufb m4, m14 7389 pshufb m5, m14 7390 pmaddubsw m0, m15 7391 pmaddubsw m2, m15 7392 pmaddubsw m1, m15 7393 pmaddubsw m3, m15 7394 pmaddubsw m4, m15 7395 pmaddubsw m5, m15 7396 phaddw m0, m2 7397 phaddw m1, m3 7398 phaddw m4, m5 7399 pmulhrsw m0, m12 ; 0 2 7400 pmulhrsw m1, m12 ; 1 3 7401 pmulhrsw m4, m12 ; 4 5 7402 pshufd m8, m11, q0000 7403 pshufd m9, m11, q1111 7404 pshufd m10, m11, q2222 7405 pshufd m11, m11, q3333 7406%else 7407 pxor m3, m3 7408 pcmpeqd m8, m3 7409 psrld m14, 10 7410 pshufb m14, [base+bdct_lb_dw] 7411 movu m1, [srcq+ssq*0] 7412 movu m2, [srcq+ssq*2] 7413 movu m3, [srcq+ssq*1] 7414 add srcq, ss3q 7415 punpcklqdq m6, m6 7416 SWAP m4, m7 7417 pand m7, m11, m8 7418 pandn m8, m15 7419 SWAP m15, m8 7420 por m15, m7 7421 paddb m14, m6 7422 movu m0, [srcq+ssq*0] 7423 movu m7, [srcq+ssq*1] 7424 movu m6, [srcq+ssq*2] 7425 add srcq, ss3q 7426 pshufb m1, m14 7427 pshufb m2, m14 7428 pshufb m3, m14 7429 pshufb m0, m14 7430 pshufb m7, m14 7431 pshufb m6, m14 7432 pmaddubsw m1, m15 7433 pmaddubsw m2, m15 7434 pmaddubsw m3, m15 7435 mova [esp+0x00], m14 7436 mova [esp+0x10], m15 7437 pmaddubsw m0, m15 7438 pmaddubsw m7, m15 7439 pmaddubsw m6, m15 7440 %define m14 [esp+0x00] 7441 %define m15 [esp+0x10] 7442 phaddw m1, m2 7443 phaddw m3, m0 7444 phaddw m7, m6 7445 %ifidn %1, put 7446 mov dsd, dsm 7447 %define dstq r5 7448 %else 7449 %define tmpq r5 7450 %endif 7451 movd m6, r4 7452 movd m0, r5 7453 punpckldq m6, m0 7454 punpcklbw m6, m6 7455 psraw m6, 8 7456 mov r5, r0m 7457 pmulhrsw m1, m12 ; 0 2 7458 pmulhrsw m3, m12 ; 1 3 7459 pmulhrsw m7, m12 ; 4 5 7460 SWAP m0, m1, m3 7461 SWAP m4, m7 7462 pshufd m2, 
m6, q0000 7463 pshufd m3, m6, q1111 7464 pshufd m7, m6, q2222 7465 pshufd m6, m6, q3333 7466 mova [esp+0x30], m2 7467 mova [esp+0x40], m3 7468 mova [esp+0x50], m7 7469 mova [esp+0x60], m6 7470 %define m8 [esp+0x30] 7471 %define m9 [esp+0x40] 7472 %define m10 [esp+0x50] 7473 %define m11 [esp+0x60] 7474%endif 7475 psrldq m5, m4, 8 ; 5 _ 7476 punpckhwd m2, m0, m1 ; 23 7477 punpcklwd m0, m1 ; 01 7478 punpcklwd m4, m5 ; 45 7479.dy2_w4_loop: 7480 pmaddwd m0, m8 ; a0 7481 pmaddwd m5, m2, m8 ; b0 7482 pmaddwd m2, m9 ; a1 7483 pmaddwd m7, m4, m9 ; b1 7484 pmaddwd m3, m4, m10 ; a2 7485 paddd m0, m13 7486 paddd m5, m13 7487 paddd m0, m2 7488 paddd m5, m7 7489 paddd m0, m3 7490 movu m6, [srcq+ssq*0] 7491 movu m7, [srcq+ssq*1] 7492 movu m3, [srcq+ssq*2] 7493 movu m1, [srcq+ss3q ] 7494 lea srcq, [srcq+ssq*4] 7495 pshufb m6, m14 7496 pshufb m7, m14 7497 pshufb m3, m14 7498 pshufb m1, m14 7499 pmaddubsw m6, m15 7500 pmaddubsw m7, m15 7501 pmaddubsw m3, m15 7502 pmaddubsw m1, m15 7503 phaddw m6, m7 7504 phaddw m3, m1 7505 pmulhrsw m6, m12 ; 6 7 7506 pmulhrsw m3, m12 ; 8 9 7507 psrldq m7, m6, 8 7508 psrldq m1, m3, 8 7509 punpcklwd m6, m7 ; 67 7510 punpcklwd m3, m1 ; 89 7511 mova m2, m6 7512 pmaddwd m1, m6, m10 ; b2 7513 pmaddwd m6, m11 ; a3 7514 pmaddwd m7, m3, m11 ; b3 7515 paddd m5, m1 7516 paddd m0, m6 7517 paddd m5, m7 7518 psrad m0, rndshift 7519 psrad m5, rndshift 7520 packssdw m0, m5 7521%ifidn %1, put 7522 packuswb m0, m0 7523 psrldq m1, m0, 4 7524 movd [dstq+dsq*0], m0 7525 movd [dstq+dsq*1], m1 7526 lea dstq, [dstq+dsq*2] 7527%else 7528 mova [tmpq], m0 7529 add tmpq, 16 7530%endif 7531 mova m0, m4 7532 mova m4, m3 7533 sub hd, 2 7534 jg .dy2_w4_loop 7535 MC_8TAP_SCALED_RET 7536INIT_XMM ssse3 7537.dy2_w8: 7538 mov dword [rsp+0x90], 1 7539 movifprep tmp_stridem, 16 7540 jmp .dy2_w_start 7541.dy2_w16: 7542 mov dword [rsp+0x90], 2 7543 movifprep tmp_stridem, 32 7544 jmp .dy2_w_start 7545.dy2_w32: 7546 mov dword [rsp+0x90], 4 7547 movifprep tmp_stridem, 64 7548 jmp .dy2_w_start 7549.dy2_w64: 7550 mov dword [rsp+0x90], 8 7551 movifprep tmp_stridem, 128 7552 jmp .dy2_w_start 7553.dy2_w128: 7554 mov dword [rsp+0x90], 16 7555 movifprep tmp_stridem, 256 7556.dy2_w_start: 7557 mov myd, mym 7558%ifidn %1, put 7559 movifnidn dsm, dsq 7560%endif 7561%if ARCH_X86_64 7562 shr t0d, 16 7563 sub srcq, 3 7564 shr myd, 6 7565 mov r4d, 64 << 24 7566 lea myd, [t1+myq] 7567 cmovnz r4q, [base+subpel_filters+myq*8] 7568 movd m15, t0d 7569%else 7570 %define m10 [base+pd_0x3ff] 7571 %define m11 [base+pd_0x4000] 7572 %define m8 m0 7573 %define m9 m1 7574 %xdefine m14 m4 7575 %xdefine m15 m3 7576 %if isprep 7577 %define tmpq r0 7578 %define ssq ssm 7579 %else 7580 %define dstq r0 7581 %endif 7582 mov r5, [esp+0x1f0] 7583 mov r3, [esp+0x1f4] 7584 shr r5, 16 7585 sub srcq, 3 7586 movd m15, r5 7587 xor r5, r5 7588 shr myd, 6 7589 lea r3, [r3+myd] 7590 mov r4, 64 << 24 7591 cmovnz r4, [base+subpel_filters+r3*8+0] 7592 cmovnz r5, [base+subpel_filters+r3*8+4] 7593 mov r0, r0m 7594 mov r3, r3m 7595%endif 7596 pslld m7, m8, 2 ; dx*4 7597 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] 7598 pshufd m15, m15, q0000 7599 paddd m14, m8 ; mx+dx*[0-3] 7600%if ARCH_X86_64 7601 movq m3, r4q 7602 punpcklbw m3, m3 7603 psraw m3, 8 7604%else 7605 movd m5, r4 7606 movd m6, r5 7607 punpckldq m5, m6 7608 punpcklbw m5, m5 7609 psraw m5, 8 7610 SWAP m3, m5 7611%endif 7612 mova [rsp+0x100], m7 7613 mova [rsp+0x120], m15 7614 mov [rsp+0x098], srcq 7615 mov [rsp+0x130], r0q ; dstq / tmpq 7616 pshufd m0, m3, q0000 7617 pshufd m1, m3, q1111 7618 pshufd m2, m3, 
q2222 7619 pshufd m3, m3, q3333 7620 mova [rsp+0x140], m0 7621 mova [rsp+0x150], m1 7622 mova [rsp+0x160], m2 7623 mova [rsp+0x170], m3 7624%if ARCH_X86_64 && UNIX64 7625 mov hm, hd 7626%elif ARCH_X86_32 7627 SWAP m5, m3 7628 mov r5, hm 7629 mov [esp+0x134], r5 7630%endif 7631 jmp .dy2_hloop 7632.dy2_hloop_prep: 7633 dec dword [rsp+0x090] 7634 jz .ret 7635%if ARCH_X86_64 7636 add qword [rsp+0x130], 8*(isprep+1) 7637 mov hd, hm 7638%else 7639 add dword [rsp+0x130], 8*(isprep+1) 7640 mov r5, [esp+0x134] 7641 mov r0, [esp+0x130] 7642%endif 7643 mova m7, [rsp+0x100] 7644 mova m14, [rsp+0x110] 7645%if ARCH_X86_64 7646 mova m10, [base+pd_0x3ff] 7647%else 7648 %define m10 [base+pd_0x3ff] 7649%endif 7650 mova m15, [rsp+0x120] 7651 mov srcq, [rsp+0x098] 7652%if ARCH_X86_64 7653 mov r0q, [rsp+0x130] ; dstq / tmpq 7654%else 7655 mov hm, r5 7656 mov r0m, r0 7657 mov r3, r3m 7658%endif 7659 paddd m14, m7 7660.dy2_hloop: 7661 pxor m9, m9 7662%if ARCH_X86_64 7663 mova m11, [base+pq_0x40000000] 7664%else 7665 %define m11 [base+pq_0x40000000] 7666%endif 7667 psrld m2, m14, 10 7668 mova [rsp], m2 7669 pand m6, m14, m10 7670 psrld m6, 6 7671 paddd m5, m15, m6 7672 pcmpeqd m6, m9 7673 psrldq m2, m5, 8 7674%if ARCH_X86_64 7675 movd r4d, m5 7676 movd r6d, m2 7677 psrldq m5, 4 7678 psrldq m2, 4 7679 movd r7d, m5 7680 movd r9d, m2 7681 movq m0, [base+subpel_filters+r4*8] 7682 movq m1, [base+subpel_filters+r6*8] 7683 movhps m0, [base+subpel_filters+r7*8] 7684 movhps m1, [base+subpel_filters+r9*8] 7685%else 7686 movd r0, m5 7687 movd rX, m2 7688 psrldq m5, 4 7689 psrldq m2, 4 7690 movd r4, m5 7691 movd r5, m2 7692 movq m0, [base+subpel_filters+r0*8] 7693 movq m1, [base+subpel_filters+rX*8] 7694 movhps m0, [base+subpel_filters+r4*8] 7695 movhps m1, [base+subpel_filters+r5*8] 7696 pxor m2, m2 7697 %define m9 m2 7698%endif 7699 paddd m14, m7 ; mx+dx*[4-7] 7700 pand m5, m14, m10 7701 psrld m5, 6 7702 paddd m15, m5 7703 pcmpeqd m5, m9 7704 mova [rsp+0x110], m14 7705 psrldq m4, m15, 8 7706%if ARCH_X86_64 7707 movd r10d, m15 7708 movd r11d, m4 7709 psrldq m15, 4 7710 psrldq m4, 4 7711 movd r13d, m15 7712 movd rXd, m4 7713 movq m2, [base+subpel_filters+r10*8] 7714 movq m3, [base+subpel_filters+r11*8] 7715 movhps m2, [base+subpel_filters+r13*8] 7716 movhps m3, [base+subpel_filters+ rX*8] 7717 psrld m14, 10 7718 psrldq m4, m14, 8 7719 movd r10d, m14 7720 movd r11d, m4 7721 psrldq m14, 4 7722 psrldq m4, 4 7723 movd r13d, m14 7724 movd rXd, m4 7725 mov r4d, [rsp+ 0] 7726 mov r6d, [rsp+ 8] 7727 mov r7d, [rsp+ 4] 7728 mov r9d, [rsp+12] 7729 pshufd m4, m6, q1100 7730 pshufd m6, m6, q3322 7731 pshufd m7, m5, q1100 7732 pshufd m5, m5, q3322 7733 pand m8, m11, m4 7734 pand m9, m11, m6 7735 pand m15, m11, m7 7736 pand m11, m11, m5 7737 pandn m4, m0 7738 pandn m6, m1 7739 pandn m7, m2 7740 pandn m5, m3 7741 por m8, m4 7742 por m9, m6 7743 por m15, m7 7744 por m11, m5 7745 mova [rsp+0x10], m8 7746 mova [rsp+0x20], m9 7747 mova [rsp+0x30], m15 7748 mova [rsp+0x40], m11 7749 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 7750 mova [rsp+0x50], m1 7751 mova [rsp+0x60], m2 7752 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 7753 mova [rsp+0x70], m3 7754 mova [rsp+0x80], m4 7755 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 7756 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 7757 SWAP m7, m0 7758 SWAP m8, m14 7759 mova m1, [rsp+0x50] 7760 mova m2, [rsp+0x60] 7761 mova m3, [rsp+0x70] 7762 mova m15, [rsp+0x80] 7763 punpcklwd m4, m5, m6 ; 45a 7764 punpckhwd m5, m6 ; 45b 7765 punpcklwd m6, 
m7, m8 ; 67a 7766 punpckhwd m7, m8 ; 67b 7767 SWAP m14, m8 7768 mova m8, [rsp+0x140] 7769 mova m9, [rsp+0x150] 7770 mova m10, [rsp+0x160] 7771 mova m11, [rsp+0x170] 7772 punpcklwd m0, m1, m2 ; 01a 7773 punpckhwd m1, m2 ; 01b 7774 punpcklwd m2, m3, m15; 23a 7775 punpckhwd m3, m15 ; 23b 7776 mova [rsp+0x50], m4 7777 mova [rsp+0x60], m5 7778 mova [rsp+0x70], m6 7779 mova [rsp+0x80], m7 7780%else 7781 movd r0, m15 7782 movd rX, m4 7783 psrldq m15, 4 7784 psrldq m4, 4 7785 movd r4, m15 7786 movd r5, m4 7787 mova m14, [esp+0x110] 7788 movq m2, [base+subpel_filters+r0*8] 7789 movq m3, [base+subpel_filters+rX*8] 7790 movhps m2, [base+subpel_filters+r4*8] 7791 movhps m3, [base+subpel_filters+r5*8] 7792 psrld m14, 10 7793 mova [esp+16], m14 7794 mov r0, [esp+ 0] 7795 mov rX, [esp+ 8] 7796 mov r4, [esp+ 4] 7797 mov r5, [esp+12] 7798 mova [esp+0x20], m0 7799 mova [esp+0x30], m1 7800 mova [esp+0x40], m2 7801 mova [esp+0x50], m3 7802 pshufd m4, m6, q1100 7803 pshufd m6, m6, q3322 7804 pshufd m7, m5, q1100 7805 pshufd m5, m5, q3322 7806 pand m0, m11, m4 7807 pand m1, m11, m6 7808 pand m2, m11, m7 7809 pand m3, m11, m5 7810 pandn m4, [esp+0x20] 7811 pandn m6, [esp+0x30] 7812 pandn m7, [esp+0x40] 7813 pandn m5, [esp+0x50] 7814 por m0, m4 7815 por m1, m6 7816 por m2, m7 7817 por m3, m5 7818 mova [esp+0x20], m0 7819 mova [esp+0x30], m1 7820 mova [esp+0x40], m2 7821 mova [esp+0x50], m3 7822 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 7823 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 7824 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 7825 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 7826 mova m5, [esp+0x1a0] 7827 mova m6, [esp+0x1b0] 7828 mova m7, [esp+0x1c0] 7829 mova m0, [esp+0x1d0] 7830 punpcklwd m4, m5, m6 ; 45a 7831 punpckhwd m5, m6 ; 45b 7832 punpcklwd m6, m7, m0 ; 67a 7833 punpckhwd m7, m0 ; 67b 7834 mova [esp+0x1a0], m4 7835 mova [esp+0x1b0], m5 7836 mova [esp+0x1c0], m6 7837 mova [esp+0x1d0], m7 7838 mova m1, [esp+0x060] 7839 mova m2, [esp+0x070] 7840 mova m3, [esp+0x180] 7841 mova m4, [esp+0x190] 7842 punpcklwd m0, m1, m2 ; 01a 7843 punpckhwd m1, m2 ; 01b 7844 punpcklwd m2, m3, m4 ; 23a 7845 punpckhwd m3, m4 ; 23b 7846 mova [esp+0x180], m2 7847 mova [esp+0x190], m3 7848 %define m8 [esp+0x140] 7849 %define m9 [esp+0x150] 7850 %define m10 [esp+0x160] 7851 %define m11 [esp+0x170] 7852%endif 7853.dy2_vloop: 7854%if ARCH_X86_32 7855 mov r0, r0m 7856%endif 7857 pmaddwd m4, m0, m8 7858 pmaddwd m5, m1, m8 7859 pmaddwd m6, m2, m9 7860 pmaddwd m7, m3, m9 7861 paddd m4, m13 7862 paddd m5, m13 7863 paddd m4, m6 7864 paddd m5, m7 7865%if ARCH_X86_64 7866 pmaddwd m6, [rsp+0x50], m10 7867 pmaddwd m7, [rsp+0x60], m10 7868%else 7869 pmaddwd m6, [esp+0x1a0], m10 7870 pmaddwd m7, [esp+0x1b0], m10 7871%endif 7872 paddd m4, m6 7873 paddd m5, m7 7874%if ARCH_X86_64 7875 pmaddwd m6, [rsp+0x70], m11 7876 pmaddwd m7, [rsp+0x80], m11 7877%else 7878 pmaddwd m6, [esp+0x1c0], m11 7879 pmaddwd m7, [esp+0x1d0], m11 7880%endif 7881 paddd m4, m6 7882 paddd m5, m7 7883 psrad m4, rndshift 7884 psrad m5, rndshift 7885 packssdw m4, m5 7886%ifidn %1, put 7887 packuswb m4, m4 7888 movq [dstq], m4 7889 add dstq, dsm 7890%else 7891 mova [tmpq], m4 7892 add tmpq, tmp_stridem 7893%endif 7894%if ARCH_X86_32 7895 mov r0m, r0 7896%endif 7897 dec hd 7898 jz .dy2_hloop_prep 7899%if ARCH_X86_64 7900 mova m8, [rsp+0x10] 7901 mova m9, [rsp+0x20] 7902 mova m10, [rsp+0x30] 7903 mova m11, [rsp+0x40] 7904 mova m0, m2 ; 01a 7905 mova m1, m3 ; 01b 7906 MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 7907 mova m3, [rsp+0x50] ; 23a 7908 mova m4, [rsp+0x60] ; 23b 7909 mova m5, [rsp+0x70] 
; 45a 7910 mova m7, [rsp+0x80] ; 45b 7911 mova m8, [rsp+0x140] 7912 mova m9, [rsp+0x150] 7913 mova m10, [rsp+0x160] 7914 mova m11, [rsp+0x170] 7915 punpcklwd m14, m2, m6 ; 67a 7916 punpckhwd m2, m6 ; 67b 7917 mova [rsp+0x50], m5 7918 mova [rsp+0x60], m7 7919 mova [rsp+0x70], m14 7920 mova [rsp+0x80], m2 7921 mova m2, m3 7922 mova m3, m4 7923%else 7924 MC_8TAP_SCALED_H 0x20, 0 7925 punpcklwd m6, m0, m4 7926 punpckhwd m7, m0, m4 7927 mova m0, [esp+0x180] ; 01a 7928 mova m1, [esp+0x190] ; 01b 7929 mova m2, [rsp+0x1a0] ; 23a 7930 mova m3, [esp+0x1b0] ; 23b 7931 mova m4, [esp+0x1c0] ; 45a 7932 mova m5, [esp+0x1d0] ; 45b 7933 mova [esp+0x180], m2 7934 mova [esp+0x190], m3 7935 mova [esp+0x1a0], m4 7936 mova [esp+0x1b0], m5 7937 mova [esp+0x1c0], m6 ; 67a 7938 mova [esp+0x1d0], m7 ; 67b 7939%endif 7940 jmp .dy2_vloop 7941.ret: 7942 MC_8TAP_SCALED_RET 0 7943%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT 7944 %define r0m [rstk+stack_offset+ 4] 7945 %define r1m [rstk+stack_offset+ 8] 7946 %define r2m [rstk+stack_offset+12] 7947 %define r3m [rstk+stack_offset+16] 7948%endif 7949%undef isprep 7950%endmacro 7951 7952%macro BILIN_SCALED_FN 1 7953cglobal %1_bilin_scaled_8bpc 7954 mov t0d, (5*15 << 16) | 5*15 7955 mov t1d, (5*15 << 16) | 5*15 7956 jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) 7957%endmacro 7958 7959%if WIN64 7960DECLARE_REG_TMP 6, 5 7961%elif ARCH_X86_64 7962DECLARE_REG_TMP 6, 8 7963%else 7964DECLARE_REG_TMP 1, 2 7965%endif 7966%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, 7967BILIN_SCALED_FN put 7968PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_8bpc 7969PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc 7970PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc 7971PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc 7972PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc 7973PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc 7974PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc 7975PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc 7976PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR 7977MC_8TAP_SCALED put 7978 7979%if WIN64 7980DECLARE_REG_TMP 5, 4 7981%elif ARCH_X86_64 7982DECLARE_REG_TMP 6, 7 7983%else 7984DECLARE_REG_TMP 1, 2 7985%endif 7986%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, 7987BILIN_SCALED_FN prep 7988PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc 7989PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc 7990PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc 7991PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc 7992PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc 7993PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc 7994PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc 7995PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc 7996PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR 7997MC_8TAP_SCALED prep 7998 7999%if ARCH_X86_32 8000 %macro SAVE_ALPHA_BETA 0 8001 mov alpham, alphad 8002 mov betam, betad 8003 %endmacro 8004 8005 %macro SAVE_DELTA_GAMMA 0 8006 mov deltam, deltad 8007 mov gammam, gammad 8008 %endmacro 8009 8010 %macro LOAD_ALPHA_BETA_MX 0 8011 mov mym, myd 8012 mov alphad, alpham 8013 mov betad, betam 8014 mov mxd, mxm 8015 %endmacro 8016 8017 %macro LOAD_DELTA_GAMMA_MY 0 8018 mov mxm, mxd 8019 mov deltad, deltam 8020 mov 
gammad, gammam 8021 mov myd, mym 8022 %endmacro 8023 8024 %define PIC_reg r2 8025 %define PIC_base_offset $$ 8026 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) 8027%else 8028 %define SAVE_ALPHA_BETA 8029 %define SAVE_DELTA_GAMMA 8030 %define PIC_sym(sym) sym 8031%endif 8032 8033%if ARCH_X86_32 8034 %if STACK_ALIGNMENT < required_stack_alignment 8035 %assign copy_args 8*4 8036 %else 8037 %assign copy_args 0 8038 %endif 8039%endif 8040 8041%macro RELOC_ARGS 0 8042 %if copy_args 8043 mov r0, r0m 8044 mov r1, r1m 8045 mov r2, r2m 8046 mov r3, r3m 8047 mov r5, r5m 8048 mov dstm, r0 8049 mov dsm, r1 8050 mov srcm, r2 8051 mov ssm, r3 8052 mov mxm, r5 8053 mov r0, r6m 8054 mov mym, r0 8055 %endif 8056%endmacro 8057 8058%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2 8059 %if cpuflag(sse4) 8060 pblendw %1, %2, 0xAA 8061 %else 8062 pand %2, m10 8063 por %1, %2 8064 %endif 8065%endmacro 8066 8067%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 8068 %if ARCH_X86_32 8069 %define m8 m4 8070 %define m9 m5 8071 %define m14 m6 8072 %define m15 m7 8073 %define m11 m7 8074 %endif 8075 %if ARCH_X86_32 8076 pxor m11, m11 8077 %endif 8078 lea tmp1d, [myq+deltaq*4] 8079 lea tmp2d, [myq+deltaq*1] 8080 shr myd, 10 8081 shr tmp1d, 10 8082 movq m2, [filterq+myq *8] ; a 8083 movq m8, [filterq+tmp1q*8] ; e 8084 lea tmp1d, [tmp2q+deltaq*4] 8085 lea myd, [tmp2q+deltaq*1] 8086 shr tmp2d, 10 8087 shr tmp1d, 10 8088 movq m3, [filterq+tmp2q*8] ; b 8089 movq m0, [filterq+tmp1q*8] ; f 8090 punpcklwd m2, m3 8091 punpcklwd m8, m0 8092 lea tmp1d, [myq+deltaq*4] 8093 lea tmp2d, [myq+deltaq*1] 8094 shr myd, 10 8095 shr tmp1d, 10 8096 movq m0, [filterq+myq *8] ; c 8097 movq m9, [filterq+tmp1q*8] ; g 8098 lea tmp1d, [tmp2q+deltaq*4] 8099 lea myd, [tmp2q+gammaq] ; my += gamma 8100 shr tmp2d, 10 8101 shr tmp1d, 10 8102 movq m3, [filterq+tmp2q*8] ; d 8103 movq m1, [filterq+tmp1q*8] ; h 8104 punpcklwd m0, m3 8105 punpcklwd m9, m1 8106 punpckldq m1, m2, m0 8107 punpckhdq m2, m0 8108 punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 8109 punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 8110 punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 8111 punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 8112 pmaddwd m0, %3 8113 pmaddwd m3, %5 8114 pmaddwd m1, %7 8115 pmaddwd m14, %9 8116 paddd m0, m3 8117 paddd m1, m14 8118 paddd m0, m1 8119 mova %1, m0 8120 %if ARCH_X86_64 8121 SWAP m3, m14 8122 %endif 8123 punpckldq m0, m8, m9 8124 punpckhdq m8, m9 8125 punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8 8126 punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8 8127 punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8 8128 punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8 8129 pmaddwd m1, %4 8130 pmaddwd m14, %6 8131 pmaddwd m2, %8 8132 pmaddwd m15, %10 8133 paddd m1, m14 8134 paddd m2, m15 8135 paddd m1, m2 8136 mova %2, m1 8137 %if ARCH_X86_64 8138 SWAP m14, m3 8139 %endif 8140%endmacro 8141 8142%if ARCH_X86_64 8143 %define counterd r4d 8144%else 8145 %if copy_args == 0 8146 %define counterd dword r4m 8147 %else 8148 %define counterd dword [esp+stack_size-4*7] 8149 %endif 8150%endif 8151 8152%macro WARP_AFFINE_8X8 0 8153%if ARCH_X86_64 8154cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts 8155%else 8156cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts 8157 %if copy_args 8158 %define tmpm [esp+stack_size-4*1] 8159 %define tsm [esp+stack_size-4*2] 8160 %endif 8161%endif 8162 call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main 8163.loop: 8164%if ARCH_X86_32 8165 
%define m12 m4 8166 %define m13 m5 8167 %define m14 m6 8168 %define m15 m7 8169 mova m12, [esp+0xC0] 8170 mova m13, [esp+0xD0] 8171 mova m14, [esp+0xE0] 8172 mova m15, [esp+0xF0] 8173%endif 8174 psrad m12, 13 8175 psrad m13, 13 8176 psrad m14, 13 8177 psrad m15, 13 8178 packssdw m12, m13 8179 packssdw m14, m15 8180 mova m13, [PIC_sym(pw_8192)] 8181 pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 8182 pmulhrsw m14, m13 8183 mova [tmpq+tsq*0], m12 8184 mova [tmpq+tsq*2], m14 8185 dec counterd 8186 jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end 8187%if ARCH_X86_32 8188 mov tmpm, tmpd 8189 mov r0, [esp+0x100] 8190 mov r1, [esp+0x104] 8191%endif 8192 call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 8193 lea tmpq, [tmpq+tsq*4] 8194 jmp .loop 8195 8196%if ARCH_X86_64 8197cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ 8198 dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ 8199 filter, tmp1, delta, my, gamma 8200%else 8201cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ 8202 dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ 8203 filter, tmp1, delta, my, gamma 8204 %define alphaq r0 8205 %define alphad r0 8206 %define alpham [esp+gprsize+0x100] 8207 %define betaq r1 8208 %define betad r1 8209 %define betam [esp+gprsize+0x104] 8210 %define deltaq r0 8211 %define deltad r0 8212 %define deltam [esp+gprsize+0x108] 8213 %define gammaq r1 8214 %define gammad r1 8215 %define gammam [esp+gprsize+0x10C] 8216 %define filterq r3 8217 %define tmp1q r4 8218 %define tmp1d r4 8219 %define tmp1m [esp+gprsize+0x110] 8220 %define myq r5 8221 %define myd r5 8222 %define mym r6m 8223 %if copy_args 8224 %define dstm [esp+stack_size-4*1] 8225 %define dsm [esp+stack_size-4*2] 8226 %define srcm [esp+stack_size-4*3] 8227 %define ssm [esp+stack_size-4*4] 8228 %define mxm [esp+stack_size-4*5] 8229 %define mym [esp+stack_size-4*6] 8230 %endif 8231%endif 8232 call .main 8233 jmp .start 8234.loop: 8235%if ARCH_X86_32 8236 mov dstm, dstd 8237 mov alphad, [esp+0x100] 8238 mov betad, [esp+0x104] 8239%endif 8240 call .main2 8241 lea dstq, [dstq+dsq*2] 8242.start: 8243%if notcpuflag(sse4) 8244 %define roundval pw_8192 8245 %if ARCH_X86_64 8246 mova m10, [PIC_sym(roundval)] 8247 %else 8248 %define m10 [PIC_sym(roundval)] 8249 %endif 8250%endif 8251%if ARCH_X86_32 8252 %define m12 m5 8253 %define m13 m6 8254 mova m12, [esp+0xC0] 8255 mova m13, [esp+0xD0] 8256%endif 8257%if cpuflag(sse4) 8258 %if ARCH_X86_32 8259 %define m11 m4 8260 pxor m11, m11 8261 %endif 8262 psrad m12, 18 8263 psrad m13, 18 8264 packusdw m12, m13 8265 pavgw m12, m11 ; (x + (1 << 10)) >> 11 8266%else 8267 psrad m12, 17 8268 psrad m13, 17 8269 packssdw m12, m13 8270 pmulhrsw m12, m10 8271%endif 8272%if ARCH_X86_32 8273 %define m14 m6 8274 %define m15 m7 8275 mova m14, [esp+0xE0] 8276 mova m15, [esp+0xF0] 8277%endif 8278%if cpuflag(sse4) 8279 psrad m14, 18 8280 psrad m15, 18 8281 packusdw m14, m15 8282 pavgw m14, m11 ; (x + (1 << 10)) >> 11 8283%else 8284 psrad m14, 17 8285 psrad m15, 17 8286 packssdw m14, m15 8287 pmulhrsw m14, m10 8288%endif 8289 packuswb m12, m14 8290 movq [dstq+dsq*0], m12 8291 movhps [dstq+dsq*1], m12 8292 dec counterd 8293 jg .loop 8294.end: 8295 RET 8296ALIGN function_align 8297.main: 8298%assign stack_offset stack_offset+gprsize 8299%if ARCH_X86_32 8300 %assign stack_size stack_size+4 8301 %if copy_args 8302 %assign stack_offset stack_offset-4 8303 %endif 8304 RELOC_ARGS 8305 LEA PIC_reg, $$ 8306 %define PIC_mem [esp+gprsize+0x114] 8307 mov abcdd, abcdm 8308 %if copy_args == 0 8309 mov ssd, ssm 8310 mov 
mxd, mxm 8311 %endif 8312 mov PIC_mem, PIC_reg 8313 mov srcd, srcm 8314%endif 8315 movsx deltad, word [abcdq+2*2] 8316 movsx gammad, word [abcdq+2*3] 8317 lea tmp1d, [deltaq*3] 8318 sub gammad, tmp1d ; gamma -= delta*3 8319 SAVE_DELTA_GAMMA 8320%if ARCH_X86_32 8321 mov abcdd, abcdm 8322%endif 8323 movsx alphad, word [abcdq+2*0] 8324 movsx betad, word [abcdq+2*1] 8325 lea tmp1q, [ssq*3+3] 8326 add mxd, 512+(64<<10) 8327 lea tmp2d, [alphaq*3] 8328 sub srcq, tmp1q ; src -= src_stride*3 + 3 8329%if ARCH_X86_32 8330 mov srcm, srcd 8331 mov PIC_reg, PIC_mem 8332%endif 8333 sub betad, tmp2d ; beta -= alpha*3 8334 lea filterq, [PIC_sym(mc_warp_filter2)] 8335%if ARCH_X86_64 8336 mov myd, r6m 8337 pxor m11, m11 8338%endif 8339 call .h 8340 psrld m2, m0, 16 8341 psrld m3, m1, 16 8342%if ARCH_X86_32 8343 mova [esp+gprsize+0x10], m3 8344%endif 8345 call .h 8346 psrld m4, m0, 16 8347 psrld m5, m1, 16 8348%if ARCH_X86_32 8349 mova [esp+gprsize+0x20], m4 8350 mova [esp+gprsize+0x30], m5 8351%endif 8352 call .h 8353%if ARCH_X86_64 8354 %define blendmask [rsp+gprsize+0x80] 8355%else 8356 mova m3, [esp+gprsize+0x10] 8357 %define blendmask [esp+gprsize+0x120] 8358 %define m10 m7 8359%endif 8360 pcmpeqd m10, m10 8361 pslld m10, 16 8362 mova blendmask, m10 8363 BLENDHWDW m2, m0 ; 0 8364 BLENDHWDW m3, m1 ; 2 8365 mova [rsp+gprsize+0x00], m2 8366 mova [rsp+gprsize+0x10], m3 8367 call .h 8368%if ARCH_X86_32 8369 mova m4, [esp+gprsize+0x20] 8370 mova m5, [esp+gprsize+0x30] 8371%endif 8372 mova m10, blendmask 8373 BLENDHWDW m4, m0 ; 1 8374 BLENDHWDW m5, m1 ; 3 8375 mova [rsp+gprsize+0x20], m4 8376 mova [rsp+gprsize+0x30], m5 8377 call .h 8378%if ARCH_X86_32 8379 mova m3, [esp+gprsize+0x10] 8380 %define m10 m5 8381%endif 8382 psrld m6, m2, 16 8383 psrld m7, m3, 16 8384 mova m10, blendmask 8385 BLENDHWDW m6, m0 ; 2 8386 BLENDHWDW m7, m1 ; 4 8387 mova [rsp+gprsize+0x40], m6 8388 mova [rsp+gprsize+0x50], m7 8389 call .h 8390%if ARCH_X86_32 8391 mova m4, [esp+gprsize+0x20] 8392 mova m5, [esp+gprsize+0x30] 8393%endif 8394 psrld m2, m4, 16 8395 psrld m3, m5, 16 8396 mova m10, blendmask 8397 BLENDHWDW m2, m0 ; 3 8398 BLENDHWDW m3, m1 ; 5 8399 mova [rsp+gprsize+0x60], m2 8400 mova [rsp+gprsize+0x70], m3 8401 call .h 8402%if ARCH_X86_32 8403 mova m6, [esp+gprsize+0x40] 8404 mova m7, [esp+gprsize+0x50] 8405 %define m10 m7 8406%endif 8407 psrld m4, m6, 16 8408 psrld m5, m7, 16 8409 mova m10, blendmask 8410 BLENDHWDW m4, m0 ; 4 8411 BLENDHWDW m5, m1 ; 6 8412%if ARCH_X86_64 8413 add myd, 512+(64<<10) 8414 mova m6, m2 8415 mova m7, m3 8416%else 8417 mova [esp+gprsize+0x80], m4 8418 mova [esp+gprsize+0x90], m5 8419 add dword mym, 512+(64<<10) 8420%endif 8421 mov counterd, 4 8422 SAVE_ALPHA_BETA 8423.main2: 8424 call .h 8425%if ARCH_X86_32 8426 mova m6, [esp+gprsize+0x60] 8427 mova m7, [esp+gprsize+0x70] 8428 %define m10 m5 8429%endif 8430 psrld m6, 16 8431 psrld m7, 16 8432 mova m10, blendmask 8433 BLENDHWDW m6, m0 ; 5 8434 BLENDHWDW m7, m1 ; 7 8435%if ARCH_X86_64 8436 WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ 8437 m4, m5, \ 8438 [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ 8439 m6, m7 8440%else 8441 mova [esp+gprsize+0xA0], m6 8442 mova [esp+gprsize+0xB0], m7 8443 LOAD_DELTA_GAMMA_MY 8444 WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ 8445 [esp+gprsize+0x00], [esp+gprsize+0x10], \ 8446 [esp+gprsize+0x80], [esp+gprsize+0x90], \ 8447 [esp+gprsize+0x20], [esp+gprsize+0x30], \ 8448 [esp+gprsize+0xA0], [esp+gprsize+0xB0] 8449 LOAD_ALPHA_BETA_MX 8450%endif 8451 call .h 8452 mova m2, [rsp+gprsize+0x40] 8453 mova m3, 
[rsp+gprsize+0x50] 8454%if ARCH_X86_32 8455 mova m4, [rsp+gprsize+0x80] 8456 mova m5, [rsp+gprsize+0x90] 8457 %define m10 m7 8458%endif 8459 mova [rsp+gprsize+0x00], m2 8460 mova [rsp+gprsize+0x10], m3 8461 mova [rsp+gprsize+0x40], m4 8462 mova [rsp+gprsize+0x50], m5 8463 psrld m4, 16 8464 psrld m5, 16 8465 mova m10, blendmask 8466 BLENDHWDW m4, m0 ; 6 8467 BLENDHWDW m5, m1 ; 8 8468%if ARCH_X86_64 8469 WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ 8470 m6, m7, \ 8471 [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ 8472 m4, m5 8473%else 8474 mova [esp+gprsize+0x80], m4 8475 mova [esp+gprsize+0x90], m5 8476 LOAD_DELTA_GAMMA_MY 8477 WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ 8478 [esp+gprsize+0x20], [esp+gprsize+0x30], \ 8479 [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ 8480 [esp+gprsize+0x00], [esp+gprsize+0x10], \ 8481 [esp+gprsize+0x80], [esp+gprsize+0x90] 8482 mov mym, myd 8483 mov dstd, dstm 8484 mov dsd, dsm 8485 mov mxd, mxm 8486%endif 8487 mova m2, [rsp+gprsize+0x60] 8488 mova m3, [rsp+gprsize+0x70] 8489%if ARCH_X86_32 8490 mova m6, [esp+gprsize+0xA0] 8491 mova m7, [esp+gprsize+0xB0] 8492%endif 8493 mova [rsp+gprsize+0x20], m2 8494 mova [rsp+gprsize+0x30], m3 8495 mova [rsp+gprsize+0x60], m6 8496 mova [rsp+gprsize+0x70], m7 8497 ret 8498ALIGN function_align 8499.h: 8500%if ARCH_X86_32 8501 %define m8 m3 8502 %define m9 m4 8503 %define m10 m5 8504 %define m14 m6 8505 %define m15 m7 8506%endif 8507 lea tmp1d, [mxq+alphaq*4] 8508 lea tmp2d, [mxq+alphaq*1] 8509%if ARCH_X86_32 8510 %assign stack_offset stack_offset+4 8511 %assign stack_size stack_size+4 8512 %define PIC_mem [esp+gprsize*2+0x114] 8513 mov PIC_mem, PIC_reg 8514 mov srcd, srcm 8515%endif 8516 movu m10, [srcq] 8517%if ARCH_X86_32 8518 add srcd, ssm 8519 mov srcm, srcd 8520 mov PIC_reg, PIC_mem 8521%else 8522 add srcq, ssq 8523%endif 8524 shr mxd, 10 8525 shr tmp1d, 10 8526 movq m1, [filterq+mxq *8] ; 0 X 8527 movq m8, [filterq+tmp1q*8] ; 4 X 8528 lea tmp1d, [tmp2q+alphaq*4] 8529 lea mxd, [tmp2q+alphaq*1] 8530 shr tmp2d, 10 8531 shr tmp1d, 10 8532 movhps m1, [filterq+tmp2q*8] ; 0 1 8533 movhps m8, [filterq+tmp1q*8] ; 4 5 8534 lea tmp1d, [mxq+alphaq*4] 8535 lea tmp2d, [mxq+alphaq*1] 8536 shr mxd, 10 8537 shr tmp1d, 10 8538 movq m14, [filterq+mxq *8] ; 2 X 8539 movq m9, [filterq+tmp1q*8] ; 6 X 8540 lea tmp1d, [tmp2q+alphaq*4] 8541 lea mxd, [tmp2q+betaq] ; mx += beta 8542 shr tmp2d, 10 8543 shr tmp1d, 10 8544 movhps m14, [filterq+tmp2q*8] ; 2 3 8545 movhps m9, [filterq+tmp1q*8] ; 6 7 8546 pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] 8547 pmaddubsw m0, m1 8548 pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] 8549 pmaddubsw m1, m8 8550 pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] 8551 pmaddubsw m15, m14 8552 pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] 8553 pmaddubsw m10, m9 8554 phaddw m0, m15 8555 phaddw m1, m10 8556 mova m14, [PIC_sym(pw_8192)] 8557 mova m9, [PIC_sym(pd_32768)] 8558 pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 8559 pmaddwd m1, m14 8560 paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword 8561 paddd m1, m9 8562 ret 8563%endmacro 8564 8565%if WIN64 8566DECLARE_REG_TMP 6, 4 8567%else 8568DECLARE_REG_TMP 6, 7 8569%endif 8570 8571%macro BIDIR_FN 1 ; op 8572 %1 0 8573 lea stride3q, [strideq*3] 8574 jmp wq 8575.w4_loop: 8576 %1_INC_PTR 2 8577 %1 0 8578 lea dstq, [dstq+strideq*4] 8579.w4: ; tile 4x 8580 movd [dstq ], m0 ; copy dw[0] 8581 pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] 8582 movd [dstq+strideq*1], m1 ; copy dw[1] 8583 punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] 8584 movd [dstq+strideq*2], m0 
; dw[2]
    psrlq                m0, 32 ; shift right in dw[3]
    movd   [dstq+stride3q ], m0 ; copy
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq*2]
.w8:
    movq   [dstq          ], m0
    movhps [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq]
.w16:
    mova   [dstq          ], m0
    dec                  hd
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq]
.w32:
    mova   [dstq          ], m0
    %1                    2
    mova   [dstq + 16     ], m0
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR            8
    %1                    0
    add                dstq, strideq
.w64:
%assign i 0
%rep 4
    mova   [dstq + i*16   ], m0
    %assign i i+1
    %if i < 4
        %1            2*i
    %endif
%endrep
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    %1_INC_PTR           16
    %1                    0
    add                dstq, strideq
.w128:
%assign i 0
%rep 8
    mova   [dstq + i*16   ], m0
    %assign i i+1
    %if i < 8
        %1            2*i
    %endif
%endrep
    dec                  hd
    jg .w128_loop
    RET
%endmacro

%macro AVG 1 ; src_offset
    ; averages the int16 intermediates from tmp1/tmp2 into uint8 pixels
    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add 8 int16 coefs from tmp2
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m2 ; (a + b + 16) >> 5
    pmulhrsw             m1, m2
    packuswb             m0, m1 ; pack words from m0 & m1 to bytes, unsigned saturation
%endmacro

%macro AVG_INC_PTR 1
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro

cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm ; trailing zero count = log2(w)
    movifnidn            hd, hm ; move h(stack) to h(register) if not already there
    movsxd               wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width
    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; pmulhrsw by 1024 = (a + b + 16) >> 5
    add                  wq, r6
    BIDIR_FN            AVG

%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova                 m2, [tmp1q+(%1+0)*mmsize]
    mova                 m0, m2
    psubw                m2, [tmp2q+(%1+0)*mmsize]
    mova                 m3, [tmp1q+(%1+1)*mmsize]
    mova                 m1, m3
    psubw                m3, [tmp2q+(%1+1)*mmsize]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
    LEA                  r6, w_avg_ssse3_table
    tzcnt                wd, wm
    movd                 m4, r6m
    movifnidn            hd, hm
    pxor                 m0, m0
    movsxd               wq, dword [r6+wq*4]
    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
    pshufb               m4, m0
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    mov                  r6, tmp1q
    psubw                m0, m4
    mov               tmp1q, tmp2q
    mova                 m4, m0 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN          W_AVG
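; The MASK macro below vectorizes per-pixel mask blending. As a reference,
; this is the scalar computation it performs (a C sketch, illustrative only
; and not part of the build; the helper name is ours). The (b - a) and -m
; terms are each doubled so that pmulhw's implicit >> 16 leaves the product
; upshifted by exactly 10 bits, matching the derivation commented inside
; the macro:
;
;   static inline int mask_blend_px(int a, int b, int m) { /* m in [0,64] */
;       int hi = (((b - a) << 1) * -(m << 9)) >> 16; /* pmulhw high word */
;       return (hi + b + 8) >> 4; /* == (a*m + b*(64-m) + 512) >> 10 */
;   }
;
; (Assumes arithmetic right shifts and that (b - a) << 1 stays within
; int16, which holds for the intermediate range the SIMD code relies on.)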
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
    mova                 m3, [maskq+(%1+0)*(mmsize/2)]
    mova                 m0, [tmp2q+(%1+0)*mmsize] ; b
    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
    mova                 m6, m3 ; m
    psubb                m3, m4, m6 ; -m
    paddw                m1, m1 ; (b - a) << 1
    paddb                m3, m3 ; -m << 1
    punpcklbw            m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
    pmulhw               m1, m2 ; (-m * (b - a)) << 10
    paddw                m0, m1 ; + b
    mova                 m1, [tmp2q+(%1+1)*mmsize] ; b
    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
    paddw                m2, m2 ; (b - a) << 1
    mova                 m6, m3 ; -m << 1
    punpckhbw            m3, m4, m6 ; -m << 9
    pmulhw               m2, m3 ; (-m * (b - a)) << 10
    paddw                m1, m2 ; + b
    pmulhrsw             m0, m5 ; (x + 8) >> 4
    pmulhrsw             m1, m5
    packuswb             m0, m1 ; pack words to bytes, unsigned saturation
%endmacro

%macro MASK_INC_PTR 1
    add               maskq, %1*mmsize/2
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro

%if ARCH_X86_64
cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
    movifnidn            hd, hm
%else
cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
%define hd dword r5m
%endif
%define base r6-mask_ssse3_table
    LEA                  r6, mask_ssse3_table
    tzcnt                wd, wm
    movsxd               wq, dword [r6+wq*4]
    pxor                 m4, m4
    mova                 m5, [base+pw_2048]
    add                  wq, r6
    mov               maskq, r6m
    BIDIR_FN           MASK
%undef hd

%macro W_MASK_420_END 1-*
%rep %0
    call .main
    paddw                m2, [maskq+16*%1]
    mova    [maskq+16*%1], m2
    mova [dstq+strideq*1+16*(2*%1+0)], m0
    call .main
    psubw                m3, m7, m2
    psubw                m1, m7, [maskq+16*%1]
    psubw                m3, [dstq+strideq*1+16*(2*%1+1)]
    psrlw                m1, 2
    psrlw                m3, 2
    packuswb             m1, m3
    mova    [maskq+16*%1], m1
    mova [dstq+strideq*1+16*(2*%1+1)], m0
    %rotate 1
%endrep
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA                  t0, w_mask_420_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    sub               tmp2q, tmp1q
    movsxd               wq, [t0+wq*4]
    mova                 m6, [base+pw_2048]
    movddup              m7, [base+wm_420_sign+r6*8] ; 258 - sign
    add                  wq, t0
%if ARCH_X86_64
    mova                 m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    movifnidn            hd, hm
%else
    %define              m8 [base+pw_6903]
    %define              hd dword hm
%endif
    mov               maskq, maskmp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    add               maskq, 4
    lea                dstq, [dstq+strideq*2]
.w4:
    pshufd               m3, m2, q2020
    pshufd               m2, m2, q3131
    psubw                m1, m7, m3
    psubw                m1, m2
    psrlw                m1, 2
    packuswb             m1, m1
    movd            [maskq], m1
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    lea                dstq, [dstq+strideq*2]
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    add               maskq, 4
    lea                dstq, [dstq+strideq*2]
.w8:
    movhlps              m3, m2
    psubw                m1, m7, m2
    psubw                m1, m3
    psrlw                m1, 2
    packuswb             m1, m1
    movd            [maskq], m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    add               maskq, 8
    lea                dstq, [dstq+strideq*2]
.w16:
    mova   [dstq+strideq*1], m2
    mova   [dstq+strideq*0], m0
    call .main
    psubw                m1, m7, [dstq+strideq*1]
    psubw                m1, m2
    psrlw                m1, 2
    packuswb             m1, m1
    movq            [maskq], m1
    mova   [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w16_loop
    RET
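; For reference, the 4:2:0 mask collapse done by the .w4/.w8/.w16 tails
; above and by W_MASK_420_END in the wider paths below is, per 2x2 block
; (C sketch, illustrative only; array names are ours, not dav1d's):
;
;   /* m44: per-pixel weights in [38,64] as produced by .main */
;   int s = m44[2*y][2*x]   + m44[2*y][2*x+1] +
;           m44[2*y+1][2*x] + m44[2*y+1][2*x+1];
;   mask[y][x] = (s + 2 - sign) >> 2;
;
; The asm keeps horizontal pair sums of (64 - m) instead, so it reaches the
; same value as (258 - sign - s0 - s1) >> 2, with s0/s1 the two row pair
; sums; 258 - sign is the wm_420_sign constant preloaded into m7.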
.w32_loop:
    call .main
    add               maskq, 16
    lea                dstq, [dstq+strideq*2]
.w32:
    mova            [maskq], m2
    mova [dstq+strideq*0+16*0], m0
    call .main
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*1], m0
    W_MASK_420_END        0
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add               maskq, 16*2
    lea                dstq, [dstq+strideq*2]
.w64:
    mova       [maskq+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    call .main
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*1], m0
    call .main
    mova       [maskq+16*1], m2
    mova [dstq+strideq*0+16*2], m0
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m0
    W_MASK_420_END        0, 1
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add               maskq, 16*4
    lea                dstq, [dstq+strideq*2]
.w128:
    mova       [maskq+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    call .main
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*1], m0
    call .main
    mova       [maskq+16*1], m2
    mova [dstq+strideq*0+16*2], m0
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m0
    call .main
    mova       [maskq+16*2], m2
    mova [dstq+strideq*0+16*4], m0
    call .main
    mova [dstq+strideq*1+16*5], m2
    mova [dstq+strideq*0+16*5], m0
    call .main
    mova       [maskq+16*3], m2
    mova [dstq+strideq*0+16*6], m0
    call .main
    mova [dstq+strideq*1+16*7], m2
    mova [dstq+strideq*0+16*7], m0
    W_MASK_420_END        0, 1, 2, 3
    sub                  hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
    mova                 m0, [tmp1q      +16*0]
    mova                 m3, [tmp1q+tmp2q+16*0]
    mova                 m1, [tmp1q      +16*1]
    mova                 m4, [tmp1q+tmp2q+16*1]
    add               tmp1q, 16*2
    psubw                m3, m0 ; b - a
    psubw                m4, m1
    pabsw                m5, m3
    psubusw              m2, m8, m5
    psrlw                m2, 8 ; 64 - m = sat(6903 - |b - a|) >> 8, i.e. m in [38, 64]
    psllw                m5, m2, 10 ; (64 - m) << 10
    pmulhw               m3, m5 ; ((b - a) * (64 - m)) >> 6
    pabsw                m5, m4
    paddw                m0, m3 ; a + that = (a * m + b * (64 - m)) >> 6
    psubusw              m3, m8, m5
    psrlw                m3, 8
    phaddw               m2, m3 ; horizontal pair sums of (64 - m)
    psllw                m3, 10
    pmulhw               m4, m3
    paddw                m1, m4
    pmulhrsw             m0, m6 ; (x + 8) >> 4
    pmulhrsw             m1, m6
    packuswb             m0, m1
    ret

%macro W_MASK_422_BACKUP 1 ; mask_offset
%if ARCH_X86_64
    mova                m10, m2
%else
    mova    [maskq+16*%1], m2
%endif
%endmacro

%macro W_MASK_422_END 1 ; mask_offset
%if ARCH_X86_64
    packuswb            m10, m2
    psubb                m1, m7, m10
    pavgb                m1, m9
%else
    mova                 m3, [maskq+16*%1]
    packuswb             m3, m2
    pxor                 m2, m2
    psubb                m1, m7, m3
    pavgb                m1, m2
%endif
    mova    [maskq+16*%1], m1
%endmacro

cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA                  t0, w_mask_422_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    sub               tmp2q, tmp1q
    movsxd               wq, [t0+wq*4]
    mova                 m6, [base+pw_2048]
    movddup              m7, [base+wm_422_sign+r6*8] ; 128 - sign
    add                  wq, t0
%if ARCH_X86_64
    mova                 m8, [base+pw_6903]
    pxor                 m9, m9
    movifnidn            hd, hm
%else
    add                  t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
    %define              hd dword hm
%endif
    mov               maskq, maskmp
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    jmp                  wq
.w4_loop:
    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
    add               maskq, 8
    lea                dstq, [dstq+strideq*2]
.w4:
    packuswb             m2, m2
    psubb                m1, m7, m2
%if ARCH_X86_64
    pavgb                m1, m9 ; (m0 + m1 + 1 - sign) >> 1
%else
    pxor                 m2, m2
    pavgb                m1, m2
%endif
    movq            [maskq], m1
    movd   [dstq+strideq*0], m0
pshuflw m1, m0, q1032 9026 movd [dstq+strideq*1], m1 9027 punpckhqdq m0, m0 9028 lea dstq, [dstq+strideq*2] 9029 movd [dstq+strideq*0], m0 9030 pshuflw m1, m0, q1032 9031 movd [dstq+strideq*1], m1 9032 sub hd, 4 9033 jg .w4_loop 9034 RET 9035.w8_loop: 9036 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9037 add maskq, 16 9038 lea dstq, [dstq+strideq*2] 9039.w8: 9040 W_MASK_422_BACKUP 0 9041 movq [dstq+strideq*0], m0 9042 movhps [dstq+strideq*1], m0 9043 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9044 lea dstq, [dstq+strideq*2] 9045 W_MASK_422_END 0 9046 movq [dstq+strideq*0], m0 9047 movhps [dstq+strideq*1], m0 9048 sub hd, 4 9049 jg .w8_loop 9050 RET 9051.w16_loop: 9052 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9053 add maskq, 16 9054 lea dstq, [dstq+strideq*2] 9055.w16: 9056 W_MASK_422_BACKUP 0 9057 mova [dstq+strideq*0], m0 9058 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9059 W_MASK_422_END 0 9060 mova [dstq+strideq*1], m0 9061 sub hd, 2 9062 jg .w16_loop 9063 RET 9064.w32_loop: 9065 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9066 add maskq, 16 9067 add dstq, strideq 9068.w32: 9069 W_MASK_422_BACKUP 0 9070 mova [dstq+16*0], m0 9071 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9072 W_MASK_422_END 0 9073 mova [dstq+16*1], m0 9074 dec hd 9075 jg .w32_loop 9076 RET 9077.w64_loop: 9078 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9079 add maskq, 16*2 9080 add dstq, strideq 9081.w64: 9082 W_MASK_422_BACKUP 0 9083 mova [dstq+16*0], m0 9084 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9085 W_MASK_422_END 0 9086 mova [dstq+16*1], m0 9087 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9088 W_MASK_422_BACKUP 1 9089 mova [dstq+16*2], m0 9090 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9091 W_MASK_422_END 1 9092 mova [dstq+16*3], m0 9093 dec hd 9094 jg .w64_loop 9095 RET 9096.w128_loop: 9097 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9098 add maskq, 16*4 9099 add dstq, strideq 9100.w128: 9101 W_MASK_422_BACKUP 0 9102 mova [dstq+16*0], m0 9103 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9104 W_MASK_422_END 0 9105 mova [dstq+16*1], m0 9106 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9107 W_MASK_422_BACKUP 1 9108 mova [dstq+16*2], m0 9109 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9110 W_MASK_422_END 1 9111 mova [dstq+16*3], m0 9112 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9113 W_MASK_422_BACKUP 2 9114 mova [dstq+16*4], m0 9115 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9116 W_MASK_422_END 2 9117 mova [dstq+16*5], m0 9118 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9119 W_MASK_422_BACKUP 3 9120 mova [dstq+16*6], m0 9121 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main 9122 W_MASK_422_END 3 9123 mova [dstq+16*7], m0 9124 dec hd 9125 jg .w128_loop 9126 RET 9127 9128cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask 9129%define base t0-w_mask_444_ssse3_table 9130 LEA t0, w_mask_444_ssse3_table 9131 tzcnt wd, wm 9132 mov maskq, maskmp 9133 sub tmp2q, tmp1q 9134 movsxd wq, [t0+wq*4] 9135 mova m6, [base+pw_6903] 9136 mova m7, [base+pw_2048] 9137 add wq, t0 9138%if ARCH_X86_64 9139 mova m8, [base+pb_64] 9140 movifnidn hd, hm 9141%else 9142 %define m8 [base+pb_64] 9143 %define hd dword hm 9144%endif 9145 call .main 9146 jmp wq 9147.w4_loop: 9148 call .main 9149 lea dstq, [dstq+strideq*2] 9150.w4: 9151 movd [dstq+strideq*0], m0 
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    lea                dstq, [dstq+strideq*2]
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16:
    mova   [dstq+strideq*0], m0
    call .main
    mova   [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    mova        [dstq+16*0], m0
    call .main
    mova        [dstq+16*1], m0
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+16*0], m0
    call .main
    mova        [dstq+16*1], m0
    call .main
    mova        [dstq+16*2], m0
    call .main
    mova        [dstq+16*3], m0
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova        [dstq+16*0], m0
    call .main
    mova        [dstq+16*1], m0
    call .main
    mova        [dstq+16*2], m0
    call .main
    mova        [dstq+16*3], m0
    call .main
    mova        [dstq+16*4], m0
    call .main
    mova        [dstq+16*5], m0
    call .main
    mova        [dstq+16*6], m0
    call .main
    mova        [dstq+16*7], m0
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    mova                 m0, [tmp1q      +16*0]
    mova                 m3, [tmp1q+tmp2q+16*0]
    mova                 m1, [tmp1q      +16*1]
    mova                 m4, [tmp1q+tmp2q+16*1]
    add               tmp1q, 16*2
    psubw                m3, m0
    psubw                m4, m1
    pabsw                m5, m3
    psubusw              m2, m6, m5
    psrlw                m2, 8 ; 64 - m
    psllw                m5, m2, 10
    pmulhw               m3, m5
    pabsw                m5, m4
    paddw                m0, m3
    psubusw              m3, m6, m5
    psrlw                m3, 8
    packuswb             m2, m3
    psllw                m3, 10
    pmulhw               m4, m3
    psubb                m3, m8, m2 ; m
    paddw                m1, m4
    pmulhrsw             m0, m7
    pmulhrsw             m1, m7
    mova            [maskq], m3
    add               maskq, 16
    packuswb             m0, m1
    ret

%macro BLEND_64M 4; a, b, mask1, mask2
    punpcklbw            m0, %1, %2 ; {b;a}[7..0]
    punpckhbw            %1, %2     ; {b;a}[15..8]
    pmaddubsw            m0, %3     ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmaddubsw            %1, %4     ; {b*m[1] + (64-m[1])*a}[15..8] u16
    pmulhrsw             m0, m5     ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
    pmulhrsw             %1, m5     ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16
    packuswb             m0, %1     ; {blendpx}[15..0] u8
%endmacro

%macro BLEND 2; a, b
    psubb                m3, m4, m0 ; m3 = (64 - m)
    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
    BLEND_64M            %1, %2, m2, m3
%endmacro
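; BLEND_64M/BLEND above compute the standard OBMC blend. As a scalar
; reference (C sketch, illustrative only; the helper name is ours):
;
;   static inline uint8_t blend_px(uint8_t a, uint8_t b, uint8_t m) {
;       return (a * (64 - m) + b * m + 32) >> 6; /* m in [0, 64] */
;   }
;
; pmaddubsw multiplies each unsigned {a,b} byte pair with the signed
; {64-m,m} weight pair, and pmulhrsw by pw_512 performs the (x + 32) >> 6
; rounding in a single step.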
cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_ssse3_table
    LEA                  r6, blend_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movifnidn         maskq, maskmp
    movsxd               wq, dword [r6+wq*4]
    mova                 m4, [base+pb_64]
    mova                 m5, [base+pw_512]
    add                  wq, r6
    lea                  r6, [dsq*3]
    jmp                  wq
.w4:
    movq                 m0, [maskq] ; m
    movd                 m1, [dstq+dsq*0] ; a
    movd                 m6, [dstq+dsq*1]
    punpckldq            m1, m6
    movq                 m6, [tmpq] ; b
    psubb                m3, m4, m0 ; m3 = (64 - m)
    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
    punpcklbw            m1, m6     ; {b;a}[7..0]
    pmaddubsw            m1, m2     ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmulhrsw             m1, m5     ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
    packuswb             m1, m0     ; {blendpx}[15..0] u8
    movd      [dstq+dsq*0], m1
    psrlq                m1, 32
    movd      [dstq+dsq*1], m1
    add               maskq, 8
    add                tmpq, 8
    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
    sub                  hd, 2
    jg .w4
    RET
.w8:
    mova                 m0, [maskq] ; m
    movq                 m1, [dstq+dsq*0] ; a
    movhps               m1, [dstq+dsq*1]
    mova                 m6, [tmpq] ; b
    BLEND                m1, m6
    movq      [dstq+dsq*0], m0
    movhps    [dstq+dsq*1], m0
    add               maskq, 16
    add                tmpq, 16
    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
    sub                  hd, 2
    jg .w8
    RET
.w16:
    mova                 m0, [maskq] ; m
    mova                 m1, [dstq] ; a
    mova                 m6, [tmpq] ; b
    BLEND                m1, m6
    mova             [dstq], m0
    add               maskq, 16
    add                tmpq, 16
    add                dstq, dsq ; dst_stride
    dec                  hd
    jg .w16
    RET
.w32:
%assign i 0
%rep 2
    mova                 m0, [maskq+16*i] ; m
    mova                 m1, [dstq+16*i] ; a
    mova                 m6, [tmpq+16*i] ; b
    BLEND                m1, m6
    mova       [dstq+i*16], m0
    %assign i i+1
%endrep
    add               maskq, 32
    add                tmpq, 32
    add                dstq, dsq ; dst_stride
    dec                  hd
    jg .w32
    RET

cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_ssse3_table
    LEA                  r5, blend_v_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r5+wq*4]
    mova                 m5, [base+pw_512]
    add                  wq, r5
    add               maskq, obmc_masks-blend_v_ssse3_table
    jmp                  wq
.w2:
    movd                 m3, [maskq+4]
    punpckldq            m3, m3
    ; 2 mask blend is provided for 4 pixels / 2 lines
.w2_loop:
    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
    pinsrw               m1, [dstq+dsq*1], 1
    movd                 m2, [tmpq] ; b
    punpcklbw            m0, m1, m2 ; {b;a}[7..0]
    pmaddubsw            m0, m3 ; {b*m + (64-m)*a}[7..0] u16
    pmulhrsw             m0, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
    packuswb             m0, m1 ; {blendpx}[3..0] u8
    movd                r3d, m0
    mov       [dstq+dsq*0], r3w
    shr                 r3d, 16
    mov       [dstq+dsq*1], r3w
    add                tmpq, 2*2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    movddup              m3, [maskq+8]
    ; 4 mask blend is provided for 8 pixels / 2 lines
.w4_loop:
    movd                 m1, [dstq+dsq*0] ; a
    movd                 m2, [dstq+dsq*1]
    punpckldq            m1, m2
    movq                 m2, [tmpq] ; b
    punpcklbw            m1, m2 ; {b;a}[7..0]
    pmaddubsw            m1, m3 ; {b*m + (64-m)*a}[7..0] u16
    pmulhrsw             m1, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
    packuswb             m1, m1 ; {blendpx}[7..0] u8
    movd             [dstq], m1
    psrlq                m1, 32
    movd      [dstq+dsq*1], m1
    add                tmpq, 2*4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    mova                 m3, [maskq+16]
    ; 8 mask blend is provided for 16 pixels
.w8_loop:
    movq                 m1, [dstq+dsq*0] ; a
    movhps               m1, [dstq+dsq*1]
    mova                 m2, [tmpq] ; b
    BLEND_64M            m1, m2, m3, m3
    movq      [dstq+dsq*0], m0
    movhps    [dstq+dsq*1], m0
    add                tmpq, 16
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    ; 16 mask blend is provided for 32 pixels
    mova                 m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
    mova                 m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
.w16_loop:
    mova                 m1, [dstq] ; a
    mova                 m2, [tmpq] ; b
    BLEND_64M            m1, m2, m3, m4
    mova             [dstq], m0
    add                tmpq, 16
    add                dstq, dsq
    dec                  hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    mova            [rsp+8], xmm6
%endif
    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
    ; 16 mask blend is provided for 64 pixels
.w32_loop:
    mova                 m1, [dstq+16*0] ; a
    mova                 m2, [tmpq+16*0] ; b
    BLEND_64M            m1, m2, m3, m4
    movq                 m1, [dstq+16*1] ; a
    punpcklbw m1,
[tmpq+16*1] ; b 9444 pmaddubsw m1, m6 9445 pmulhrsw m1, m5 9446 packuswb m1, m1 9447 mova [dstq+16*0], m0 9448 movq [dstq+16*1], m1 9449 add tmpq, 32 9450 add dstq, dsq 9451 dec hd 9452 jg .w32_loop 9453%if WIN64 9454 mova xmm6, [rsp+8] 9455%endif 9456 RET 9457 9458cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask 9459%define base t0-blend_h_ssse3_table 9460%if ARCH_X86_32 9461 ; We need to keep the PIC pointer for w4, reload wd from stack instead 9462 DECLARE_REG_TMP 6 9463%else 9464 DECLARE_REG_TMP 5 9465 mov r6d, wd 9466%endif 9467 LEA t0, blend_h_ssse3_table 9468 tzcnt wd, wm 9469 mov hd, hm 9470 movsxd wq, dword [t0+wq*4] 9471 mova m5, [base+pw_512] 9472 add wq, t0 9473 lea maskq, [base+obmc_masks+hq*2] 9474 lea hd, [hq*3] 9475 shr hd, 2 ; h * 3/4 9476 lea maskq, [maskq+hq*2] 9477 neg hq 9478 jmp wq 9479.w2: 9480 movd m0, [dstq+dsq*0] 9481 pinsrw m0, [dstq+dsq*1], 1 9482 movd m2, [maskq+hq*2] 9483 movd m1, [tmpq] 9484 punpcklwd m2, m2 9485 punpcklbw m0, m1 9486 pmaddubsw m0, m2 9487 pmulhrsw m0, m5 9488 packuswb m0, m0 9489 movd r3d, m0 9490 mov [dstq+dsq*0], r3w 9491 shr r3d, 16 9492 mov [dstq+dsq*1], r3w 9493 lea dstq, [dstq+dsq*2] 9494 add tmpq, 2*2 9495 add hq, 2 9496 jl .w2 9497 RET 9498.w4: 9499%if ARCH_X86_32 9500 mova m3, [base+blend_shuf] 9501%else 9502 mova m3, [blend_shuf] 9503%endif 9504.w4_loop: 9505 movd m0, [dstq+dsq*0] 9506 movd m2, [dstq+dsq*1] 9507 punpckldq m0, m2 ; a 9508 movq m1, [tmpq] ; b 9509 movq m2, [maskq+hq*2] ; m 9510 pshufb m2, m3 9511 punpcklbw m0, m1 9512 pmaddubsw m0, m2 9513 pmulhrsw m0, m5 9514 packuswb m0, m0 9515 movd [dstq+dsq*0], m0 9516 psrlq m0, 32 9517 movd [dstq+dsq*1], m0 9518 lea dstq, [dstq+dsq*2] 9519 add tmpq, 4*2 9520 add hq, 2 9521 jl .w4_loop 9522 RET 9523.w8: 9524 movd m4, [maskq+hq*2] 9525 punpcklwd m4, m4 9526 pshufd m3, m4, q0000 9527 pshufd m4, m4, q1111 9528 movq m1, [dstq+dsq*0] ; a 9529 movhps m1, [dstq+dsq*1] 9530 mova m2, [tmpq] 9531 BLEND_64M m1, m2, m3, m4 9532 movq [dstq+dsq*0], m0 9533 movhps [dstq+dsq*1], m0 9534 lea dstq, [dstq+dsq*2] 9535 add tmpq, 8*2 9536 add hq, 2 9537 jl .w8 9538 RET 9539; w16/w32/w64/w128 9540.w16: 9541%if ARCH_X86_32 9542 mov r6d, wm 9543%endif 9544 sub dsq, r6 9545.w16_loop0: 9546 movd m3, [maskq+hq*2] 9547 pshuflw m3, m3, q0000 9548 punpcklqdq m3, m3 9549 mov wd, r6d 9550.w16_loop: 9551 mova m1, [dstq] ; a 9552 mova m2, [tmpq] ; b 9553 BLEND_64M m1, m2, m3, m3 9554 mova [dstq], m0 9555 add dstq, 16 9556 add tmpq, 16 9557 sub wd, 16 9558 jg .w16_loop 9559 add dstq, dsq 9560 inc hq 9561 jl .w16_loop0 9562 RET 9563 9564; emu_edge args: 9565; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, 9566; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, 9567; const pixel *ref, const ptrdiff_t ref_stride 9568; 9569; bw, bh total filled size 9570; iw, ih, copied block -> fill bottom, right 9571; x, y, offset in bw/bh -> fill top, left 9572cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ 9573 y, dst, dstride, src, sstride, \ 9574 bottomext, rightext, blk 9575 ; we assume that the buffer (stride) is larger than width, so we can 9576 ; safely overwrite by a few bytes 9577 pxor m1, m1 9578 9579%if ARCH_X86_64 9580 %define reg_zero r12q 9581 %define reg_tmp r10 9582 %define reg_src srcq 9583 %define reg_bottomext bottomextq 9584 %define reg_rightext rightextq 9585 %define reg_blkm r9m 9586%else 9587 %define reg_zero r6 9588 %define reg_tmp r0 9589 %define reg_src r1 9590 %define reg_bottomext r0 9591 %define reg_rightext r1 9592 %define reg_blkm 
r2m 9593%endif 9594 ; 9595 ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 9596 xor reg_zero, reg_zero 9597 lea reg_tmp, [ihq-1] 9598 cmp yq, ihq 9599 cmovs reg_tmp, yq 9600 test yq, yq 9601 cmovs reg_tmp, reg_zero 9602%if ARCH_X86_64 9603 imul reg_tmp, sstrideq 9604 add srcq, reg_tmp 9605%else 9606 imul reg_tmp, sstridem 9607 mov reg_src, srcm 9608 add reg_src, reg_tmp 9609%endif 9610 ; 9611 ; ref += iclip(x, 0, iw - 1) 9612 lea reg_tmp, [iwq-1] 9613 cmp xq, iwq 9614 cmovs reg_tmp, xq 9615 test xq, xq 9616 cmovs reg_tmp, reg_zero 9617 add reg_src, reg_tmp 9618%if ARCH_X86_32 9619 mov srcm, reg_src 9620%endif 9621 ; 9622 ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) 9623%if ARCH_X86_32 9624 mov r1, r1m ; restore bh 9625%endif 9626 lea reg_bottomext, [yq+bhq] 9627 sub reg_bottomext, ihq 9628 lea r3, [bhq-1] 9629 cmovs reg_bottomext, reg_zero 9630 ; 9631 9632 DEFINE_ARGS bw, bh, iw, ih, x, \ 9633 topext, dst, dstride, src, sstride, \ 9634 bottomext, rightext, blk 9635 9636 ; top_ext = iclip(-y, 0, bh - 1) 9637 neg topextq 9638 cmovs topextq, reg_zero 9639 cmp reg_bottomext, bhq 9640 cmovns reg_bottomext, r3 9641 cmp topextq, bhq 9642 cmovg topextq, r3 9643 %if ARCH_X86_32 9644 mov r4m, reg_bottomext 9645 ; 9646 ; right_ext = iclip(x + bw - iw, 0, bw - 1) 9647 mov r0, r0m ; restore bw 9648 %endif 9649 lea reg_rightext, [xq+bwq] 9650 sub reg_rightext, iwq 9651 lea r2, [bwq-1] 9652 cmovs reg_rightext, reg_zero 9653 9654 DEFINE_ARGS bw, bh, iw, ih, leftext, \ 9655 topext, dst, dstride, src, sstride, \ 9656 bottomext, rightext, blk 9657 9658 ; left_ext = iclip(-x, 0, bw - 1) 9659 neg leftextq 9660 cmovs leftextq, reg_zero 9661 cmp reg_rightext, bwq 9662 cmovns reg_rightext, r2 9663 %if ARCH_X86_32 9664 mov r3m, r1 9665 %endif 9666 cmp leftextq, bwq 9667 cmovns leftextq, r2 9668 9669%undef reg_zero 9670%undef reg_tmp 9671%undef reg_src 9672%undef reg_bottomext 9673%undef reg_rightext 9674 9675 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ 9676 topext, dst, dstride, src, sstride, \ 9677 bottomext, rightext, blk 9678 9679 ; center_h = bh - top_ext - bottom_ext 9680%if ARCH_X86_64 9681 lea r3, [bottomextq+topextq] 9682 sub centerhq, r3 9683%else 9684 mov r1, centerhm ; restore r1 9685 sub centerhq, topextq 9686 sub centerhq, r4m 9687 mov r1m, centerhq 9688%endif 9689 ; 9690 ; blk += top_ext * PXSTRIDE(dst_stride) 9691 mov r2, topextq 9692%if ARCH_X86_64 9693 imul r2, dstrideq 9694%else 9695 mov r6, r6m ; restore dstq 9696 imul r2, dstridem 9697%endif 9698 add dstq, r2 9699 mov reg_blkm, dstq ; save pointer for ext 9700 ; 9701 ; center_w = bw - left_ext - right_ext 9702 mov centerwq, bwq 9703%if ARCH_X86_64 9704 lea r3, [rightextq+leftextq] 9705 sub centerwq, r3 9706%else 9707 sub centerwq, r3m 9708 sub centerwq, leftextq 9709%endif 9710 9711; vloop Macro 9712%macro v_loop 3 ; need_left_ext, need_right_ext, suffix 9713 %if ARCH_X86_64 9714 %define reg_tmp r12 9715 %else 9716 %define reg_tmp r0 9717 %endif 9718.v_loop_%3: 9719 %if ARCH_X86_32 9720 mov r0, r0m 9721 mov r1, r1m 9722 %endif 9723%if %1 9724 ; left extension 9725 %if ARCH_X86_64 9726 movd m0, [srcq] 9727 %else 9728 mov r3, srcm 9729 movd m0, [r3] 9730 %endif 9731 pshufb m0, m1 9732 xor r3, r3 9733.left_loop_%3: 9734 mova [dstq+r3], m0 9735 add r3, mmsize 9736 cmp r3, leftextq 9737 jl .left_loop_%3 9738 ; body 9739 lea reg_tmp, [dstq+leftextq] 9740%endif 9741 xor r3, r3 9742.body_loop_%3: 9743 %if ARCH_X86_64 9744 movu m0, [srcq+r3] 9745 %else 9746 mov r1, srcm 9747 movu m0, [r1+r3] 9748 %endif 9749%if %1 9750 movu [reg_tmp+r3], m0 
; v_loop macro: copy one band of center_h rows, optionally generating the
; left and/or right edge extensions on the fly
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 %if ARCH_X86_64
  %define reg_tmp r12
 %else
  %define reg_tmp r0
 %endif
.v_loop_%3:
 %if ARCH_X86_32
    mov                  r0, r0m
    mov                  r1, r1m
 %endif
%if %1
    ; left extension
 %if ARCH_X86_64
    movd                 m0, [srcq]
 %else
    mov                  r3, srcm
    movd                 m0, [r3]
 %endif
    pshufb               m0, m1
    xor                  r3, r3
.left_loop_%3:
    mova          [dstq+r3], m0
    add                  r3, mmsize
    cmp                  r3, leftextq
    jl .left_loop_%3
    ; body
    lea             reg_tmp, [dstq+leftextq]
%endif
    xor                  r3, r3
.body_loop_%3:
 %if ARCH_X86_64
    movu                 m0, [srcq+r3]
 %else
    mov                  r1, srcm
    movu                 m0, [r1+r3]
 %endif
%if %1
    movu       [reg_tmp+r3], m0
%else
    movu          [dstq+r3], m0
%endif
    add                  r3, mmsize
    cmp                  r3, centerwq
    jl .body_loop_%3
%if %2
    ; right extension
%if %1
    add             reg_tmp, centerwq
%else
    lea             reg_tmp, [dstq+centerwq]
%endif
 %if ARCH_X86_64
    movd                 m0, [srcq+centerwq-1]
 %else
    mov                  r3, srcm
    movd                 m0, [r3+centerwq-1]
 %endif
    pshufb               m0, m1
    xor                  r3, r3
.right_loop_%3:
    movu       [reg_tmp+r3], m0
    add                  r3, mmsize
 %if ARCH_X86_64
    cmp                  r3, rightextq
 %else
    cmp                  r3, r3m
 %endif
    jl .right_loop_%3
%endif
 %if ARCH_X86_64
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
 %else
    add                dstq, dstridem
    mov                  r0, sstridem
    add                srcm, r0
    sub      dword centerhm, 1
    jg .v_loop_%3
    mov                  r0, r0m ; restore r0
 %endif
%endmacro ; v_loop

    test           leftextq, leftextq
    jnz .need_left_ext
%if ARCH_X86_64
    test          rightextq, rightextq
    jnz .need_right_ext
%else
    cmp            leftextq, r3m ; leftextq == 0, so this tests rightext (r3m)
    jne .need_right_ext
%endif
    v_loop                0, 0, 0
    jmp .body_done

; left/right extension variants
.need_left_ext:
%if ARCH_X86_64
    test          rightextq, rightextq
%else
    mov                  r3, r3m
    test                 r3, r3
%endif
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
; register layout at this point:
; r0: bw
; r1: x loop counter
; r4: y loop counter
; r5: topextq
; r6: dstq
; r7: dstrideq
; r8: srcq
%if ARCH_X86_64
 %define reg_dstride dstrideq
%else
 %define reg_dstride r2
%endif
;
; bottom edge extension
%if ARCH_X86_64
    test         bottomextq, bottomextq
    jz .top
%else
    xor                  r1, r1
    cmp                  r1, r4m
    je .top
%endif
;
%if ARCH_X86_64
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
%else
    mov                  r3, dstq
    mov         reg_dstride, dstridem
    sub                  r3, reg_dstride
    mov                srcm, r3
%endif
;
.bottom_x_loop:
%if ARCH_X86_64
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, bottomextq
%else
    mov                  r3, srcm
    mova                 m0, [r3+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, r4m
%endif
;
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, reg_dstride
    dec                  r4
    jg .bottom_y_loop
    add                  r1, mmsize
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test            topextq, topextq
    jz .end
%if ARCH_X86_64
    mov                srcq, reg_blkm
%else
    mov                  r3, reg_blkm
    mov         reg_dstride, dstridem
%endif
    mov                dstq, dstm
    xor                  r1, r1
;
.top_x_loop:
%if ARCH_X86_64
    mova                 m0, [srcq+r1]
%else
    mov                  r3, reg_blkm
    mova                 m0, [r3+r1]
%endif
    lea                  r3, [dstq+r1]
    mov                  r4, topextq
;
.top_y_loop:
    mova               [r3], m0
    add                  r3, reg_dstride
    dec                  r4
    jg .top_y_loop
    add                  r1, mmsize
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET

%undef reg_dstride
%undef reg_blkm
%undef reg_tmp

cextern resize_filter

; SCRATCH: on x86-32, spill register m%1 to a stack slot and alias m%2 to it;
; on x86-64, simply rename the register via SWAP
%macro SCRATCH 3 ; reg, alias, stack slot
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
 %define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro
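
; Horizontal/vertical resize (AV1 super-resolution). As a reference for the
; vector code below, a hedged C sketch of the per-pixel math: dx and mx0 are
; 14-bit fixed-point values computed by the caller, resize_filter holds the
; (negated) 8-tap coefficient sets, and iclip/iclip_u8 are assumed clamping
; helpers, so this mirrors rather than reproduces dav1d's C fallback:
;
;   for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
;       for (int x = 0, mx = mx0; x < dst_w; x++, mx += dx) {
;           const int src_x = mx >> 14;
;           const int8_t *const F = resize_filter[(mx >> 8) & 63];
;           int sum = 0;
;           for (int k = 0; k < 8; k++) // 8-tap window starting at src_x - 4
;               sum += F[k] * src[iclip(src_x + k - 4, 0, src_w - 1)];
;           dst[x] = iclip_u8((-sum + 64) >> 7);
;       }
;
; The SIMD version computes four output pixels per iteration and replaces the
; per-tap clamping with a clamped 8-byte load plus a pshufb edge fixup.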
%if ARCH_X86_64
cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
%elif STACK_ALIGNMENT >= 16
cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%else
cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%endif
    movifnidn          dstq, dstmp
    movifnidn          srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn        dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn            hd, hm
%endif
    sub          dword mx0m, 4<<14
    sub         dword src_wm, 8
    movd                 m7, dxm
    movd                 m6, mx0m
    movd                 m5, src_wm
    pshufd               m7, m7, q0000
    pshufd               m6, m6, q0000
    pshufd               m5, m5, q0000

%if ARCH_X86_64
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
 %define base r7-$$
%else
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
 %define hd dword r5m
 %if STACK_ALIGNMENT >= 16
    LEA                  r6, $$
  %define base r6-$$
 %else
    LEA                  r4, $$
  %define base r4-$$
 %endif
%endif

%if ARCH_X86_64
    mova                m10, [base+pw_m256]
    mova                 m9, [base+pd_63]
    mova                 m8, [base+pb_8x0_8x8]
%else
 %define m10 [base+pw_m256]
 %define m9  [base+pd_63]
 %define m8  [base+pb_8x0_8x8]
%endif
    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld                m7, 2                      ; dx*4
    pslld                m5, 14
    paddd                m6, m4                     ; mx+[0..3]*dx
    SCRATCH               7, 13, 0
    SCRATCH               6, 12, 1
    SCRATCH               5, 11, 2

; m10 = pmulhrsw constant for x=(x+64)>>7
; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = (src_w-8)<<14, m9 = 0x3f, m8 = pb_8x0_8x8

.loop_y:
    xor                  xd, xd
    mova                 m0, m12 ; per-line working version of mx

.loop_x:
    ; branchless clamp of mx to [0, (src_w-8)<<14] so the 8-byte loads
    ; below never overrun the row
    pxor                 m1, m1
    pcmpgtd              m1, m0
    pandn                m1, m0      ; max(mx, 0)
    psrad                m2, m0, 8   ; filter offset (unmasked)
    pcmpgtd              m3, m11, m1
    pand                 m1, m3
    pandn                m3, m11
    por                  m1, m3      ; min(max(mx, 0), (src_w-8)<<14)
    psubd                m3, m0, m1  ; pshufb offset
    psrad                m1, 14      ; clipped src_x offset
    psrad                m3, 14      ; pshufb edge_emu offset
    pand                 m2, m9      ; filter offset (masked)

    ; load source pixels
%if ARCH_X86_64
    movd                r8d, m1
    pshuflw              m1, m1, q3232
    movd                r9d, m1
    punpckhqdq           m1, m1
    movd               r10d, m1
    psrlq                m1, 32
    movd               r11d, m1
    movq                 m4, [srcq+r8]
    movq                 m5, [srcq+r10]
    movhps               m4, [srcq+r9]
    movhps               m5, [srcq+r11]
%else
    movd                r3d, m1
    pshufd               m1, m1, q3312
    movd                r1d, m1
    pshuflw              m1, m1, q3232
    movq                 m4, [srcq+r3]
    movq                 m5, [srcq+r1]
    movd                r3d, m1
    punpckhqdq           m1, m1
    movd                r1d, m1
    movhps               m4, [srcq+r3]
    movhps               m5, [srcq+r1]
%endif

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    pxor                 m6, m6
    pcmpeqb              m6, m3
%if ARCH_X86_64
    pmovmskb            r8d, m6
    cmp                 r8d, 0xffff
%else
    pmovmskb            r3d, m6
    cmp                 r3d, 0xffff
%endif
    je .filter

    ; edge emulation: replicate the first/last source pixel into the
    ; out-of-range lanes via per-pixel pshufb masks
%if ARCH_X86_64
    movd                r8d, m3
    pshuflw              m3, m3, q3232
    movd                r9d, m3
    punpckhqdq           m3, m3
    movd               r10d, m3
    psrlq                m3, 32
    movd               r11d, m3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    movq                 m6, [base+resize_shuf+4+r8]
    movq                 m7, [base+resize_shuf+4+r10]
    movhps               m6, [base+resize_shuf+4+r9]
    movhps               m7, [base+resize_shuf+4+r11]
%else
    movd                r3d, m3
    pshufd               m3, m3, q3312
    movd                r1d, m3
    pshuflw              m3, m3, q3232
    movq                 m6, [base+resize_shuf+4+r3]
    movq                 m7, [base+resize_shuf+4+r1]
    movd                r3d, m3
    punpckhqdq           m3, m3
    movd                r1d, m3
    movhps               m6, [base+resize_shuf+4+r3]
    movhps               m7, [base+resize_shuf+4+r1]
%endif

    paddb                m6, m8
    paddb                m7, m8
    pshufb               m4, m6
    pshufb               m5, m7
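
; A note on the arithmetic in .filter below: the coefficients loaded from
; resize_filter are stored negated (an assumption consistent with the use of
; the m10 = pw_m256 constant), so the pmaddubsw/phaddw/phaddsw chain yields
; s = -sum per output pixel, and pmulhrsw with -256 computes
;   (s * -256 + (1 << 14)) >> 15 = (256*sum + 16384) >> 15 = (sum + 64) >> 7
; i.e. the rounded 7-bit downshift in a single instruction; the final
; packuswb then clamps the result to [0, 255].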
.filter:
    ; gather the four 8-tap filters selected by (mx >> 8) & 63 (see note above)
%if ARCH_X86_64
    movd                r8d, m2
    pshuflw              m2, m2, q3232
    movd                r9d, m2
    punpckhqdq           m2, m2
    movd               r10d, m2
    psrlq                m2, 32
    movd               r11d, m2
    movq                 m6, [base+resize_filter+r8*8]
    movq                 m7, [base+resize_filter+r10*8]
    movhps               m6, [base+resize_filter+r9*8]
    movhps               m7, [base+resize_filter+r11*8]
%else
    movd                r3d, m2
    pshufd               m2, m2, q3312
    movd                r1d, m2
    pshuflw              m2, m2, q3232
    movq                 m6, [base+resize_filter+r3*8]
    movq                 m7, [base+resize_filter+r1*8]
    movd                r3d, m2
    punpckhqdq           m2, m2
    movd                r1d, m2
    movhps               m6, [base+resize_filter+r3*8]
    movhps               m7, [base+resize_filter+r1*8]
%endif

    pmaddubsw            m4, m6      ; adjacent-pair tap products
    pmaddubsw            m5, m7
    phaddw               m4, m5      ; two partial sums per pixel
    phaddsw              m4, m4      ; full 8-tap sums, saturated
    pmulhrsw             m4, m10     ; x=(x+64)>>7
    packuswb             m4, m4      ; clamp to [0, 255]
    movd          [dstq+xq], m4

    paddd                m0, m13
    add                  xd, 4
%if STACK_ALIGNMENT >= 16
    cmp                  xd, dst_wd
%else
    cmp                  xd, dst_wm
%endif
    jl .loop_x

    add                dstq, dst_stridemp
    add                srcq, src_stridemp
    dec                  hd
    jg .loop_y
    RET

INIT_XMM ssse3
WARP_AFFINE_8X8

INIT_XMM sse4
WARP_AFFINE_8X8
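
; WARP_AFFINE_8X8 (defined earlier in this file) is expanded once per ISA
; level above; with x86inc, INIT_XMM sets the cpuflags seen by cglobal, so
; the two expansions emit separate ssse3- and sse4-suffixed warp functions.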