; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; Read-only lookup tables and rounding constants.
; Do not reorder or resize the definitions below: some labels alias or
; overlap their neighbours (see deint_q_shuf, pw_16 and pd_512).
SECTION_RODATA 64

spel_h_shufA:   db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
                db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC:   db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
                db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
                db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
                db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB:   db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
                db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD:   db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
                db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
                db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
                db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8:   db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
                db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
                db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
                db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
                db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
                db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
                db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA:      db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
                db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
                db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
                db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB:      db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
                db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
                db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
                db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC:      db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
                db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
                db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
                db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a:    db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
                db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
                db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
                db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b:    db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
                db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
                db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
                db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a:    db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
                db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
                db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
                db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b:    db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
                db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
                db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
                db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16:    db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
                db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
                db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
                db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32:    db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
                db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
                db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
                db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b:  db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
                db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
                db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2:     db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a:  db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
                db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x:  db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444:  db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
                db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
                db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4:   db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
                db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
                db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8:   db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
                db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
                db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16:  db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
                db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
                db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA:  db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
                db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
                db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
                db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB:  db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
                db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
                db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
                db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end:    db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
                db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
                db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
                db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf reserves no storage of its own (its dq row is commented out),
; so it shares its address with pd_0to7, and a wider read through it also
; covers the constants that follow, including the two unlabeled dwords.
; NOTE(review): presumably only the low index bits of each qword matter to
; the consumer - confirm before reordering anything from here to pw_32766.
deint_q_shuf:     ;dq   0,  2,  4,  6,  1,  3,  5,  7
pd_0to7:          dd  0,  1,  2,  3,  4,  5,  6,  7
                  dd  1
pw_2048:          times 2 dw 2048
                  dd  3
pw_8192:          times 2 dw 8192
avg_shift:        dw  5,  5,  3,  3
pw_27615:         times 2 dw 27615
pw_32766:         times 2 dw 32766
warp8x8_permC:    db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp8x8_permD:    db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
warp_shift_h:     db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf:       db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
resize_permA:     dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB:     dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC:     dq  0,  1,  4,  5,  8,  9, 12, 13
resize_permD:     dq  2,  3,  6,  7, 10, 11, 14, 15
resize_permE:     dq  0,  2,  4,  6
resize_shufA:     db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
resize_shufB:     db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul:      dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_shuf:      db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                  db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

prep_hv_shift:    dq  6,  4
put_bilin_h_rnd:  dw  8,  8, 10, 10
prep_mul:         dw 16, 16,  4,  4
put_8tap_h_rnd:   dd 34, 40
prep_8tap_rnd:    dd 128 - (8192 << 8)
warp_8x8_rnd_h:   dd 512, 2048
warp_8x8_rnd_v:   dd 262144, 65536
warp_8x8t_rnd_v:  dd 16384 - (8192 << 15)
avg_round:        dw -16400, -16400, -16388, -16388
w_avg_round:      dd 128 + (8192 << 4),  32 + (8192 << 4)
mask_round:       dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round:     dd 128, 64
bidir_shift:      dw  6,  6,  4,  4

pb_64:            times 4 db 64
pw_m512:          times 2 dw -512
pw_2:             times 2 dw 2
pw_64:            times 2 dw 64
pd_32:            dd 32
pd_63:            dd 63
pd_128:           dd 128
pd_640:           dd 640
pd_2176:          dd 2176
pd_16384:         dd 16384
pd_0_4:           dd 0, 4

; Aliases onto existing data instead of separate constants:
; pw_16 is the leading `16, 16` word pair of prep_mul, and pd_512 is the
; leading `512` dword of warp_8x8_rnd_h.
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h

; BASE_JMP_TABLE fn, isa, width...
; Emits one 16-bit entry per width. Each entry is the distance from the code
; label <fn>_<isa> (an %xdefine below, e.g. put_avx512icl) to the label
; obtained by appending _w<width> to it (e.g. .put_w2 in put_bilin_16bpc).
; <fn>_<isa>_table is defined as the table address minus the first width, so
; that indexing with tzcnt(w)*2, as the dispatch in put_bilin_16bpc does,
; lands on the first entry for the smallest width (2*tzcnt(w) == w for
; w = 2 and w = 4).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; HV_JMP_TABLE fn, filter, isa, types, width...
; Emits up to three tables of 16-bit offsets for the function
; <fn>_<filter>_16bpc_<isa>, selected by the `types` bitmask:
;   bit 0 -> .h_w<width>  labels, table symbol <fn>_<filter>_h_<isa>_table
;   bit 1 -> .v_w<width>  labels, table symbol <fn>_<filter>_v_<isa>_table
;   bit 2 -> .hv_w<width> labels, table symbol <fn>_<filter>_hv_<isa>_table
; Offsets are relative to <fn>_<isa>, and each table symbol is biased by the
; first width, exactly as in BASE_JMP_TABLE. After each %rep has rotated the
; arguments (%0 - 4) times, the extra `%rotate 4` completes a full cycle so
; the next section sees the arguments in their original order again.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; BIDIR_JMP_TABLE fn, isa, width...
; Emits one 32-bit entry per width holding the distance from the table symbol
; itself (<fn>_<isa>_table, already biased by 2*first-width bytes) to the
; .w<width> label of <fn>_16bpc_<isa>.
; NOTE(review): the bias suggests the table is indexed with tzcnt(w)*4; the
; consumers are outside this part of the file - confirm there.
; At least one width is required because %3 is used unconditionally, hence
; the 3-* parameter range.
%macro BIDIR_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Base labels for the BASE/HV jump tables: the .put / .prep local labels
; inside the bilinear functions.
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg,        avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx512icl,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE  put,        avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE  prep,       avx512icl,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128

; Distance from the base label <type><SUFFIX> to the jump table
; <type><fn><SUFFIX>_table, e.g. table_offset(put, _bilin_h).
; SUFFIX is not defined in this file; presumably x86inc sets it from the
; active INIT_* instruction set - confirm in x86inc.asm.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
; subpel_filters points 8 bytes before the external table.
; NOTE(review): presumably so that 1-based filter indices with 8-byte rows can
; be used directly - confirm against the users of subpel_filters.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
Worker%else 266*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8 267*c0909341SAndroid Build Coastguard Worker%endif 268*c0909341SAndroid Build Coastguard Worker 269*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 270*c0909341SAndroid Build Coastguard Workercglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy 271*c0909341SAndroid Build Coastguard Worker mov mxyd, r6m ; mx 272*c0909341SAndroid Build Coastguard Worker lea r7, [put_avx512icl] 273*c0909341SAndroid Build Coastguard Worker tzcnt t0d, wm 274*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 275*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 276*c0909341SAndroid Build Coastguard Worker jnz .h 277*c0909341SAndroid Build Coastguard Worker mov mxyd, r7m ; my 278*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 279*c0909341SAndroid Build Coastguard Worker jnz .v 280*c0909341SAndroid Build Coastguard Worker.put: 281*c0909341SAndroid Build Coastguard Worker movzx t0d, word [r7+t0*2+table_offset(put,)] 282*c0909341SAndroid Build Coastguard Worker add t0, r7 283*c0909341SAndroid Build Coastguard Worker jmp t0 284*c0909341SAndroid Build Coastguard Worker.put_w2: 285*c0909341SAndroid Build Coastguard Worker mov r6d, [srcq+ssq*0] 286*c0909341SAndroid Build Coastguard Worker mov r7d, [srcq+ssq*1] 287*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 288*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6d 289*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7d 290*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 291*c0909341SAndroid Build Coastguard Worker sub hd, 2 292*c0909341SAndroid Build Coastguard Worker jg .put_w2 293*c0909341SAndroid Build Coastguard Worker RET 294*c0909341SAndroid Build Coastguard Worker.put_w4: 295*c0909341SAndroid Build Coastguard Worker mov r6, [srcq+ssq*0] 296*c0909341SAndroid Build Coastguard Worker mov r7, [srcq+ssq*1] 297*c0909341SAndroid Build Coastguard Worker lea srcq, 
[srcq+ssq*2] 298*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6 299*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7 300*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 301*c0909341SAndroid Build Coastguard Worker sub hd, 2 302*c0909341SAndroid Build Coastguard Worker jg .put_w4 303*c0909341SAndroid Build Coastguard Worker RET 304*c0909341SAndroid Build Coastguard Worker.put_w8: 305*c0909341SAndroid Build Coastguard Worker movu xmm0, [srcq+ssq*0] 306*c0909341SAndroid Build Coastguard Worker movu xmm1, [srcq+ssq*1] 307*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 308*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xmm0 309*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], xmm1 310*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 311*c0909341SAndroid Build Coastguard Worker sub hd, 2 312*c0909341SAndroid Build Coastguard Worker jg .put_w8 313*c0909341SAndroid Build Coastguard Worker RET 314*c0909341SAndroid Build Coastguard Worker.put_w16: 315*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+ssq*0] 316*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+ssq*1] 317*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 318*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 319*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], ym1 320*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 321*c0909341SAndroid Build Coastguard Worker sub hd, 2 322*c0909341SAndroid Build Coastguard Worker jg .put_w16 323*c0909341SAndroid Build Coastguard Worker RET 324*c0909341SAndroid Build Coastguard Worker.put_w32: 325*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 326*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*1] 327*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 328*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 329*c0909341SAndroid Build Coastguard Worker mova 
[dstq+dsq*1], m1 330*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 331*c0909341SAndroid Build Coastguard Worker sub hd, 2 332*c0909341SAndroid Build Coastguard Worker jg .put_w32 333*c0909341SAndroid Build Coastguard Worker RET 334*c0909341SAndroid Build Coastguard Worker.put_w64: 335*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0+64*0] 336*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*0+64*1] 337*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+ssq*1+64*0] 338*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+ssq*1+64*1] 339*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 340*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+64*0], m0 341*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+64*1], m1 342*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1+64*0], m2 343*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1+64*1], m3 344*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 345*c0909341SAndroid Build Coastguard Worker sub hd, 2 346*c0909341SAndroid Build Coastguard Worker jg .put_w64 347*c0909341SAndroid Build Coastguard Worker RET 348*c0909341SAndroid Build Coastguard Worker.put_w128: 349*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+64*0] 350*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+64*1] 351*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+64*2] 352*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+64*3] 353*c0909341SAndroid Build Coastguard Worker add srcq, ssq 354*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 355*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1 356*c0909341SAndroid Build Coastguard Worker mova [dstq+64*2], m2 357*c0909341SAndroid Build Coastguard Worker mova [dstq+64*3], m3 358*c0909341SAndroid Build Coastguard Worker add dstq, dsq 359*c0909341SAndroid Build Coastguard Worker dec hd 360*c0909341SAndroid Build Coastguard Worker jg .put_w128 
; (return of the code path that precedes .h)
    RET

; ---------------------------------------------------------------------------
; put_bilin_16bpc: horizontal-only filter
;
; On entry mxyd still holds the horizontal fraction (presumably mx; it is
; replaced by "my" two instructions below - confirm against the prologue).
; t0 is used as the index into the per-width jump table and r7 as the base
; those table offsets are relative to; both are set up before .h.
;   m5 = mx            m4 = 16 - mx   (pw_16: by its name, words of 16)
;   m6 = rounding term read from put_bilin_h_rnd[bitdepth_max >> 11]
; Every .h_w* loop evaluates, per 16-bit pixel,
;   dst[x] = (src[x]*m4 + src[x+1]*m5 + m6) >> 4
; where src[x+1] is the same row read 2 bytes further on.
; ---------------------------------------------------------------------------
.h:
    vpbroadcastw      m5, mxyd
    mov               mxyd, r7m ; my
    vpbroadcastd      m4, [pw_16]
    psubw             m4, m5
    test              mxyd, mxyd
    jnz               .hv                   ; both fractions non-zero -> 2-D filter
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx             t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
    mov               r6d, r8m ; bitdepth_max
    add               t0, r7                ; table entry is an offset from r7
    shr               r6d, 11               ; bit 11 of bitdepth_max picks the entry
    vpbroadcastd      m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
    jmp               t0

; Two rows per iteration; 8 bytes are loaded per row, 4 bytes (2 pixels) stored.
.h_w2:
    movq              xmm1, [srcq+ssq*0]
    movhps            xmm1, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    pmullw            xmm0, xmm1, xm4
    psrlq             xmm1, 16              ; word i now holds pixel i+1 of its row
    pmullw            xmm1, xm5
    paddw             xmm0, xm6
    paddw             xmm0, xmm1
    psrlw             xmm0, 4
    movd              [dstq+dsq*0], xmm0    ; row 0 result = dword 0
    pextrd            [dstq+dsq*1], xmm0, 2 ; row 1 result = dword 2
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .h_w2
    RET

; Two rows per iteration, one row per 64-bit half of the xmm register.
.h_w4:
    movq              xmm0, [srcq+ssq*0+0]
    movhps            xmm0, [srcq+ssq*1+0]
    movq              xmm1, [srcq+ssq*0+2]
    movhps            xmm1, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    pmullw            xmm0, xm4
    pmullw            xmm1, xm5
    paddw             xmm0, xm6
    paddw             xmm0, xmm1
    psrlw             xmm0, 4
    movq              [dstq+dsq*0], xmm0
    movhps            [dstq+dsq*1], xmm0
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .h_w4
    RET

; Two rows per iteration, one row per 128-bit lane of a ymm register.
.h_w8:
    movu              xm0, [srcq+ssq*0+0]
    vinserti32x4      ym0, [srcq+ssq*1+0], 1
    movu              xm1, [srcq+ssq*0+2]
    vinserti32x4      ym1, [srcq+ssq*1+2], 1
    lea               srcq, [srcq+ssq*2]
    pmullw            ym0, ym4
    pmullw            ym1, ym5
    paddw             ym0, ym6
    paddw             ym0, ym1
    psrlw             ym0, 4
    mova              [dstq+dsq*0], xm0
    vextracti32x4     [dstq+dsq*1], ym0, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .h_w8
    RET

; Two rows per iteration, one row per 256-bit half of a zmm register.
.h_w16:
    movu              ym0, [srcq+ssq*0+0]
    vinserti32x8      m0, [srcq+ssq*1+0], 1
    movu              ym1, [srcq+ssq*0+2]
    vinserti32x8      m1, [srcq+ssq*1+2], 1
    lea               srcq, [srcq+ssq*2]
    pmullw            m0, m4
    pmullw            m1, m5
    paddw             m0, m6
    paddw             m0, m1
    psrlw             m0, 4
    mova              [dstq+dsq*0], ym0
    vextracti32x8     [dstq+dsq*1], m0, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .h_w16
    RET

; Two rows per iteration, one full 64-byte register per row.
.h_w32:
    pmullw            m0, m4, [srcq+ssq*0+0]
    pmullw            m2, m5, [srcq+ssq*0+2]
    pmullw            m1, m4, [srcq+ssq*1+0]
    pmullw            m3, m5, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    paddw             m0, m6
    paddw             m1, m6
    paddw             m0, m2
    paddw             m1, m3
    psrlw             m0, 4
    psrlw             m1, 4
    mova              [dstq+dsq*0], m0
    mova              [dstq+dsq*1], m1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .h_w32
    RET

; One row per iteration, processed as two 64-byte registers.
.h_w64:
    pmullw            m0, m4, [srcq+64*0+0]
    pmullw            m2, m5, [srcq+64*0+2]
    pmullw            m1, m4, [srcq+64*1+0]
    pmullw            m3, m5, [srcq+64*1+2]
    add               srcq, ssq
    paddw             m0, m6
    paddw             m1, m6
    paddw             m0, m2
    paddw             m1, m3
    psrlw             m0, 4
    psrlw             m1, 4
    mova              [dstq+64*0], m0
    mova              [dstq+64*1], m1
    add               dstq, dsq
    dec               hd
    jg                .h_w64
    RET

; One row per iteration, processed as four 64-byte registers
; (m7-m10 hold the mx-weighted neighbours).
.h_w128:
    pmullw            m0, m4, [srcq+64*0+0]
    pmullw            m7, m5, [srcq+64*0+2]
    pmullw            m1, m4, [srcq+64*1+0]
    pmullw            m8, m5, [srcq+64*1+2]
    pmullw            m2, m4, [srcq+64*2+0]
    pmullw            m9, m5, [srcq+64*2+2]
    pmullw            m3, m4, [srcq+64*3+0]
    pmullw            m10, m5, [srcq+64*3+2]
    add               srcq, ssq
    REPX {paddw x, m6}, m0, m1, m2, m3
    paddw             m0, m7
    paddw             m1, m8
    paddw             m2, m9
    paddw             m3, m10
    REPX {psrlw x, 4}, m0, m1, m2, m3
    mova              [dstq+64*0], m0
    mova              [dstq+64*1], m1
    mova              [dstq+64*2], m2
    mova              [dstq+64*3], m3
    add               dstq, dsq
    dec               hd
    jg                .h_w128
    RET

; ---------------------------------------------------------------------------
; put_bilin_16bpc: vertical-only filter
;
; mxyd is expected to hold the vertical fraction here (loaded before .h;
; confirm against the prologue).  m8 = my << 11, so that
;   pmulhrsw(d, m8) = (d*my*2048 + 0x4000) >> 15 = (d*my + 8) >> 4
; and every .v_w* loop evaluates
;   dst = a + (((b - a)*my + 8) >> 4)       a = row y, b = row y+1
; The bottom row of one iteration is kept in registers as the top row of
; the next, so each source row is loaded once.
; ---------------------------------------------------------------------------
.v:
    movzx             t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
    shl               mxyd, 11
    vpbroadcastw      m8, mxyd
    add               t0, r7
    jmp               t0

.v_w2:
    movd              xmm0, [srcq+ssq*0]
.v_w2_loop:
    movd              xmm1, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    punpckldq         xmm2, xmm0, xmm1      ; rows 0 1
    movd              xmm0, [srcq+ssq*0]
    punpckldq         xmm1, xmm0            ; rows 1 2
    psubw             xmm1, xmm2
    pmulhrsw          xmm1, xm8
    paddw             xmm1, xmm2
    movd              [dstq+dsq*0], xmm1
    pextrd            [dstq+dsq*1], xmm1, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w2_loop
    RET

.v_w4:
    movq              xmm0, [srcq+ssq*0]
.v_w4_loop:
    movq              xmm1, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    punpcklqdq        xmm2, xmm0, xmm1      ; rows 0 1
    movq              xmm0, [srcq+ssq*0]
    punpcklqdq        xmm1, xmm0            ; rows 1 2
    psubw             xmm1, xmm2
    pmulhrsw          xmm1, xm8
    paddw             xmm1, xmm2
    movq              [dstq+dsq*0], xmm1
    movhps            [dstq+dsq*1], xmm1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w4_loop
    RET

; This path names ymm0-ymm2 directly and issues vzeroupper before returning.
.v_w8:
    movu              xmm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128    ymm1, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    vpblendd          ymm2, ymm0, ymm1, 0xf0 ; low lane row 0, high lane row 1
    vbroadcasti128    ymm0, [srcq+ssq*0]
    vpblendd          ymm1, ymm0, 0xf0       ; low lane row 1, high lane row 2
    psubw             ymm1, ymm2
    pmulhrsw          ymm1, ym8
    paddw             ymm1, ymm2
    mova              [dstq+dsq*0], xmm1
    vextracti128      [dstq+dsq*1], ymm1, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w8_loop
    vzeroupper
    RET

.v_w16:
    movu              ym0, [srcq+ssq*0]
.v_w16_loop:
    movu              ym3, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    psubw             ym1, ym3, ym0
    pmulhrsw          ym1, ym8
    paddw             ym1, ym0              ; output row 0
    movu              ym0, [srcq+ssq*0]
    psubw             ym2, ym0, ym3
    pmulhrsw          ym2, ym8
    paddw             ym2, ym3              ; output row 1
    mova              [dstq+dsq*0], ym1
    mova              [dstq+dsq*1], ym2
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w16_loop
    RET
; ---------------------------------------------------------------------------
; put_bilin_16bpc: vertical-only filter, widths 32/64/128
; m8 is the broadcast multiplier prepared at .v; each loop computes
;   dst = a + pmulhrsw(b - a, m8)           a = row y, b = row y+1
; for two output rows per iteration, carrying the bottom row over.
; ---------------------------------------------------------------------------
.v_w32:
    movu              m0, [srcq+ssq*0]
.v_w32_loop:
    movu              m3, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    psubw             m1, m3, m0
    pmulhrsw          m1, m8
    paddw             m1, m0
    movu              m0, [srcq+ssq*0]
    psubw             m2, m0, m3
    pmulhrsw          m2, m8
    paddw             m2, m3
    mova              [dstq+dsq*0], m1
    mova              [dstq+dsq*1], m2
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w32_loop
    RET

; Each row spans two 64-byte registers.
.v_w64:
    movu              m0, [srcq+ssq*0+64*0]
    movu              m1, [srcq+ssq*0+64*1]
.v_w64_loop:
    movu              m2, [srcq+ssq*1+64*0]
    movu              m3, [srcq+ssq*1+64*1]
    lea               srcq, [srcq+ssq*2]
    psubw             m4, m2, m0
    pmulhrsw          m4, m8
    paddw             m4, m0
    movu              m0, [srcq+ssq*0+64*0]
    psubw             m5, m3, m1
    pmulhrsw          m5, m8
    paddw             m5, m1
    movu              m1, [srcq+ssq*0+64*1]
    psubw             m6, m0, m2
    pmulhrsw          m6, m8
    psubw             m7, m1, m3
    pmulhrsw          m7, m8
    mova              [dstq+dsq*0+64*0], m4
    mova              [dstq+dsq*0+64*1], m5
    paddw             m6, m2
    paddw             m7, m3
    mova              [dstq+dsq*1+64*0], m6
    mova              [dstq+dsq*1+64*1], m7
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w64_loop
    RET

; Each row spans four 64-byte registers; m9-m12 are reused for both output rows.
.v_w128:
    movu              m0, [srcq+ssq*0+64*0]
    movu              m1, [srcq+ssq*0+64*1]
    movu              m2, [srcq+ssq*0+64*2]
    movu              m3, [srcq+ssq*0+64*3]
.v_w128_loop:
    movu              m4, [srcq+ssq*1+64*0]
    movu              m5, [srcq+ssq*1+64*1]
    movu              m6, [srcq+ssq*1+64*2]
    movu              m7, [srcq+ssq*1+64*3]
    lea               srcq, [srcq+ssq*2]
    psubw             m9, m4, m0
    pmulhrsw          m9, m8
    paddw             m9, m0
    movu              m0, [srcq+ssq*0+64*0]
    psubw             m10, m5, m1
    pmulhrsw          m10, m8
    paddw             m10, m1
    movu              m1, [srcq+ssq*0+64*1]
    psubw             m11, m6, m2
    pmulhrsw          m11, m8
    paddw             m11, m2
    movu              m2, [srcq+ssq*0+64*2]
    psubw             m12, m7, m3
    pmulhrsw          m12, m8
    paddw             m12, m3
    movu              m3, [srcq+ssq*0+64*3]
    mova              [dstq+dsq*0+64*0], m9
    psubw             m9, m0, m4
    pmulhrsw          m9, m8
    mova              [dstq+dsq*0+64*1], m10
    psubw             m10, m1, m5
    pmulhrsw          m10, m8
    mova              [dstq+dsq*0+64*2], m11
    psubw             m11, m2, m6
    pmulhrsw          m11, m8
    mova              [dstq+dsq*0+64*3], m12
    psubw             m12, m3, m7
    pmulhrsw          m12, m8
    paddw             m9, m4
    paddw             m10, m5
    mova              [dstq+dsq*1+64*0], m9
    mova              [dstq+dsq*1+64*1], m10
    paddw             m11, m6
    paddw             m12, m7
    mova              [dstq+dsq*1+64*2], m11
    mova              [dstq+dsq*1+64*3], m12
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .v_w128_loop
    RET

; ---------------------------------------------------------------------------
; put_bilin_16bpc: combined horizontal + vertical filter
;
; Reached from .h with m4 = 16 - mx, m5 = mx and mxyd = my.
;   m6 = pw_2, m7 = my << 11, m8 = final rounding multiplier.
; The constants are described by their names (words of 2 / 8192 / 2048).
; Horizontal pass, per row:   t = (src[x]*m4 + src[x+1]*m5 + 2) >> 2
; Vertical pass, rows a,b:    v = a + pmulhw(2*(b - a), m7)
;                               = a + (((b - a)*my) >> 4)
; Final:                      dst = pmulhrsw(v, m8)
; When bit 11 of bitdepth_max (r8m) is clear, m4/m5 are pre-scaled by 4 and
; m8 becomes pw_2048 ((v + 8) >> 4); otherwise m8 stays pw_8192 ((v + 2) >> 2).
; ---------------------------------------------------------------------------
.hv:
    movzx             t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
    shl               mxyd, 11
    vpbroadcastd      m6, [pw_2]
    vpbroadcastw      m7, mxyd
    vpbroadcastd      m8, [pw_8192]
    add               t0, r7
    test              dword r8m, 0x800
    jnz               .hv_12bpc
    psllw             m4, 2
    psllw             m5, 2
    vpbroadcastd      m8, [pw_2048]
.hv_12bpc:
    jmp               t0

; Row 0 is broadcast to both qwords so that its filtered copy is available
; in the upper half, which is what shufpd ..., 0x01 picks from xmm0.
.hv_w2:
    vpbroadcastq      xmm1, [srcq+ssq*0]
    pmullw            xmm0, xmm1, xm4
    psrlq             xmm1, 16
    pmullw            xmm1, xm5
    paddw             xmm0, xm6
    paddw             xmm0, xmm1
    psrlw             xmm0, 2
.hv_w2_loop:
    movq              xmm2, [srcq+ssq*1]
    lea               srcq, [srcq+ssq*2]
    movhps            xmm2, [srcq+ssq*0]
    pmullw            xmm1, xmm2, xm4
    psrlq             xmm2, 16
    pmullw            xmm2, xm5
    paddw             xmm1, xm6
    paddw             xmm1, xmm2
    psrlw             xmm1, 2 ; 1 _ 2 _
    shufpd            xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
    mova              xmm0, xmm1            ; keep rows 1 2 for the next iteration
    psubw             xmm1, xmm2
    paddw             xmm1, xmm1
    pmulhw            xmm1, xm7
    paddw             xmm1, xmm2
    pmulhrsw          xmm1, xm8
    movd              [dstq+dsq*0], xmm1
    pextrd            [dstq+dsq*1], xmm1, 2
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .hv_w2_loop
    RET

; The first-row loads start 8 bytes before srcq so that row 0 lands in the
; upper qword of xmm0 (the half shufpd ..., 0x01 takes from it).
; NOTE(review): this reads 8 bytes below srcq - presumably readable padding;
; confirm against the callers.
.hv_w4:
    pmullw            xmm0, xm4, [srcq+ssq*0-8]
    pmullw            xmm1, xm5, [srcq+ssq*0-6]
    paddw             xmm0, xm6
    paddw             xmm0, xmm1
    psrlw             xmm0, 2
.hv_w4_loop:
    movq              xmm1, [srcq+ssq*1+0]
    movq              xmm2, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    movhps            xmm1, [srcq+ssq*0+0]
    movhps            xmm2, [srcq+ssq*0+2]
    pmullw            xmm1, xm4
    pmullw            xmm2, xm5
    paddw             xmm1, xm6
    paddw             xmm1, xmm2
    psrlw             xmm1, 2 ; 1 2
    shufpd            xmm2, xmm0, xmm1, 0x01 ; 0 1
    mova              xmm0, xmm1
    psubw             xmm1, xmm2
    paddw             xmm1, xmm1
    pmulhw            xmm1, xm7
    paddw             xmm1, xmm2
    pmulhrsw          xmm1, xm8
    movq              [dstq+dsq*0], xmm1
    movhps            [dstq+dsq*1], xmm1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .hv_w4_loop
    RET

; Filtered row 0 is copied into the upper 128-bit lane of ym0, the lane
; vshufi32x4 ..., 0x01 takes from its first source.
.hv_w8:
    pmullw            xmm0, xm4, [srcq+ssq*0+0]
    pmullw            xmm1, xm5, [srcq+ssq*0+2]
    paddw             xmm0, xm6
    paddw             xmm0, xmm1
    psrlw             xmm0, 2
    vinserti32x4      ym0, xmm0, 1
.hv_w8_loop:
    movu              xm1, [srcq+ssq*1+0]
    movu              xm2, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    vinserti32x4      ym1, [srcq+ssq*0+0], 1
    vinserti32x4      ym2, [srcq+ssq*0+2], 1
    pmullw            ym1, ym4
    pmullw            ym2, ym5
    paddw             ym1, ym6
    paddw             ym1, ym2
    psrlw             ym1, 2 ; 1 2
    vshufi32x4        ym2, ym0, ym1, 0x01 ; 0 1
    mova              ym0, ym1
    psubw             ym1, ym2
    paddw             ym1, ym1
    pmulhw            ym1, ym7
    paddw             ym1, ym2
    pmulhrsw          ym1, ym8
    mova              [dstq+dsq*0], xm1
    vextracti32x4     [dstq+dsq*1], ym1, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .hv_w8_loop
    RET

; Same scheme with one row per 256-bit half of a zmm register.
.hv_w16:
    pmullw            ym0, ym4, [srcq+ssq*0+0]
    pmullw            ym1, ym5, [srcq+ssq*0+2]
    paddw             ym0, ym6
    paddw             ym0, ym1
    psrlw             ym0, 2
    vinserti32x8      m0, ym0, 1
.hv_w16_loop:
    movu              ym1, [srcq+ssq*1+0]
    movu              ym2, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    vinserti32x8      m1, [srcq+ssq*0+0], 1
    vinserti32x8      m2, [srcq+ssq*0+2], 1
    pmullw            m1, m4
    pmullw            m2, m5
    paddw             m1, m6
    paddw             m1, m2
    psrlw             m1, 2 ; 1 2
    vshufi32x4        m2, m0, m1, q1032 ; 0 1
    mova              m0, m1
    psubw             m1, m2
    paddw             m1, m1
    pmulhw            m1, m7
    paddw             m1, m2
    pmulhrsw          m1, m8
    mova              [dstq+dsq*0], ym1
    vextracti32x8     [dstq+dsq*1], m1, 1
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .hv_w16_loop
    RET

; Widths 32/64/128 share one implementation that walks the block in
; 64-byte-wide columns.
;   r6d = h + w*8 - 256: the low byte is reloaded into hd for each column,
;         and the part above it is decremented by 1<<8 per column, so the
;         outer loop runs w/32 times.
;         NOTE(review): relies on h fitting in 8 bits - confirm.
;   r4 / r7 = source / destination address of the current column's top row
;         (r7 is no longer needed as the jump-table base at this point).
.hv_w32:
.hv_w64:
.hv_w128:
    movifnidn         wd, wm
    lea               r6d, [hq+wq*8-256]
    mov               r4, srcq
    mov               r7, dstq
.hv_w32_loop0:
    pmullw            m0, m4, [srcq+ssq*0+0]
    pmullw            m1, m5, [srcq+ssq*0+2]
    paddw             m0, m6
    paddw             m0, m1
    psrlw             m0, 2                 ; filtered top row of the column
.hv_w32_loop:
    pmullw            m3, m4, [srcq+ssq*1+0]
    pmullw            m1, m5, [srcq+ssq*1+2]
    lea               srcq, [srcq+ssq*2]
    paddw             m3, m6
    paddw             m3, m1
    psrlw             m3, 2
    psubw             m1, m3, m0
    paddw             m1, m1
    pmulhw            m1, m7
    paddw             m1, m0
    pmullw            m0, m4, [srcq+ssq*0+0]
    pmullw            m2, m5, [srcq+ssq*0+2]
    paddw             m0, m6
    paddw             m0, m2
    psrlw             m0, 2
    psubw             m2, m0, m3
    paddw             m2, m2
    pmulhw            m2, m7
    paddw             m2, m3
    pmulhrsw          m1, m8
    pmulhrsw          m2, m8
    mova              [dstq+dsq*0], m1
    mova              [dstq+dsq*1], m2
    lea               dstq, [dstq+dsq*2]
    sub               hd, 2
    jg                .hv_w32_loop
    add               r4, 64                ; step to the next 64-byte column
    add               r7, 64
    movzx             hd, r6b               ; restore the row count
    mov               srcq, r4
    mov               dstq, r7
    sub               r6d, 1<<8
    jg                .hv_w32_loop0
    RET

; ---------------------------------------------------------------------------
; prep_bilin_16bpc(tmp, src, stride, w, h, mxy, ...)
;
; Bilinear "prep" for 16 bpc: writes intermediate words to tmp, which is
; advanced linearly (no destination stride).  mx is read from r5m, my from
; r6m and bitdepth_max from r7m.  r6 holds the address of prep_avx512icl,
; which the jump-table offsets and constant addresses below are relative to.
; With both fractions zero the .prep paths compute
;   tmp = src * prep_mul[bitdepth_max >> 11] - pw_8192
; ---------------------------------------------------------------------------
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn         mxyd, r5m ; mx
    lea               r6, [prep_avx512icl]
    tzcnt             wd, wm                ; jump tables are indexed by log2(w)
    movifnidn         hd, hm
    test              mxyd, mxyd
    jnz               .h
    mov               mxyd, r6m ; my
    test              mxyd, mxyd
    jnz               .v
.prep:
    movzx             wd, word [r6+wq*2+table_offset(prep,)]
    mov               r5d, r7m ; bitdepth_max
    vpbroadcastd      m5, [r6-prep_avx512icl+pw_8192]
    add               wq, r6
    shr               r5d, 11
    vpbroadcastd      m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea               stride3q, [strideq*3]
    jmp               wq

; Four rows of four pixels are gathered into one ymm register.
; k1 = 0b1100 write-enables only qwords 2 and 3 (the upper 128-bit lane).
.prep_w4:
    mov               r3d, 0x0c
    kmovb             k1, r3d
.prep_w4_loop:
    movq              xm0, [srcq+strideq*0]
    movhps            xm0, [srcq+strideq*1]
    vpbroadcastq      ym1, [srcq+strideq*2]
    ; upper lane <- { row 2, row 3 }; rows 0 1 in the lower lane are kept
    vpunpcklqdq       ym0{k1}, ym1, [srcq+stride3q] {1to4}
; .prep_w4_loop, continued: the four rows gathered above are scaled by m4,
; offset by m5 and written as 32 contiguous bytes of tmp.
    lea               srcq, [srcq+strideq*4]
    pmullw            ym0, ym4
    psubw             ym0, ym5
    mova              [tmpq], ym0
    add               tmpq, 32
    sub               hd, 4
    jg                .prep_w4_loop
    RET

; ---------------------------------------------------------------------------
; prep_bilin_16bpc: unfiltered paths for widths 8..128.
; All of them compute  tmp = src * m4 - m5  on 16-bit words (m4/m5 are the
; multiplier and offset broadcast at .prep) and store the result contiguously.
; ---------------------------------------------------------------------------

; Four rows per iteration, one row per 128-bit lane.
.prep_w8:
    movu              xm0, [srcq+strideq*0]
    vinserti32x4      ym0, [srcq+strideq*1], 1
    vinserti32x4      m0, [srcq+strideq*2], 2
    vinserti32x4      m0, [srcq+stride3q ], 3
    lea               srcq, [srcq+strideq*4]
    pmullw            m0, m4
    psubw             m0, m5
    mova              [tmpq], m0
    add               tmpq, 64
    sub               hd, 4
    jg                .prep_w8
    RET

; Four rows per iteration, two rows per zmm register.
.prep_w16:
    movu              ym0, [srcq+strideq*0]
    vinserti32x8      m0, [srcq+strideq*1], 1
    movu              ym1, [srcq+strideq*2]
    vinserti32x8      m1, [srcq+stride3q ], 1
    lea               srcq, [srcq+strideq*4]
    pmullw            m0, m4
    pmullw            m1, m4
    psubw             m0, m5
    psubw             m1, m5
    mova              [tmpq+64*0], m0
    mova              [tmpq+64*1], m1
    add               tmpq, 64*2
    sub               hd, 4
    jg                .prep_w16
    RET

; Four rows per iteration, one row per zmm register.
.prep_w32:
    pmullw            m0, m4, [srcq+strideq*0]
    pmullw            m1, m4, [srcq+strideq*1]
    pmullw            m2, m4, [srcq+strideq*2]
    pmullw            m3, m4, [srcq+stride3q ]
    lea               srcq, [srcq+strideq*4]
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova              [tmpq+64*0], m0
    mova              [tmpq+64*1], m1
    mova              [tmpq+64*2], m2
    mova              [tmpq+64*3], m3
    add               tmpq, 64*4
    sub               hd, 4
    jg                .prep_w32
    RET

; Two rows per iteration, two zmm registers per row.
.prep_w64:
    pmullw            m0, m4, [srcq+strideq*0+64*0]
    pmullw            m1, m4, [srcq+strideq*0+64*1]
    pmullw            m2, m4, [srcq+strideq*1+64*0]
    pmullw            m3, m4, [srcq+strideq*1+64*1]
    lea               srcq, [srcq+strideq*2]
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova              [tmpq+64*0], m0
    mova              [tmpq+64*1], m1
    mova              [tmpq+64*2], m2
    mova              [tmpq+64*3], m3
    add               tmpq, 64*4
    sub               hd, 2
    jg                .prep_w64
    RET

; One row per iteration, four zmm registers per row.
.prep_w128:
    pmullw            m0, m4, [srcq+64*0]
    pmullw            m1, m4, [srcq+64*1]
    pmullw            m2, m4, [srcq+64*2]
    pmullw            m3, m4, [srcq+64*3]
    add               srcq, strideq
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova              [tmpq+64*0], m0
    mova              [tmpq+64*1], m1
    mova              [tmpq+64*2], m2
    mova              [tmpq+64*3], m3
    add               tmpq, 64*4
    dec               hd
    jg                .prep_w128
    RET

; ---------------------------------------------------------------------------
; prep_bilin_16bpc: horizontal filter setup (reached when mx != 0)
;   m5 = mx, m4 = 16 - mx, m6 = pw_32766 (offset subtracted by the .h_w* loops)
; When bit 11 of bitdepth_max (r7m) is clear, both weights are pre-scaled
; by 4; otherwise they are used as-is.  mxyd is reloaded with my for the
; test that follows the .h_12bpc label.
; ---------------------------------------------------------------------------
.h:
    vpbroadcastw      m5, mxyd
    mov               mxyd, r6m ; my
    vpbroadcastd      m4, [pw_16]
    vpbroadcastd      m6, [pw_32766]
    psubw             m4, m5
    test              dword r7m, 0x800
    jnz               .h_12bpc
    psllw             m4, 2
    psllw             m5, 2
.h_12bpc:
Build Coastguard Worker test mxyd, mxyd 984*c0909341SAndroid Build Coastguard Worker jnz .hv 985*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 986*c0909341SAndroid Build Coastguard Worker add wq, r6 987*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 988*c0909341SAndroid Build Coastguard Worker jmp wq 989*c0909341SAndroid Build Coastguard Worker.h_w4: 990*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0] 991*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, [srcq+strideq*2], 1 992*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*1] 993*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+stride3q ], 1 994*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 995*c0909341SAndroid Build Coastguard Worker punpcklqdq ym0, ym1, ym2 996*c0909341SAndroid Build Coastguard Worker psrldq ym1, 2 997*c0909341SAndroid Build Coastguard Worker psrldq ym2, 2 998*c0909341SAndroid Build Coastguard Worker pmullw ym0, ym4 999*c0909341SAndroid Build Coastguard Worker punpcklqdq ym1, ym2 1000*c0909341SAndroid Build Coastguard Worker pmullw ym1, ym5 1001*c0909341SAndroid Build Coastguard Worker psubw ym0, ym6 1002*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 1003*c0909341SAndroid Build Coastguard Worker psraw ym0, 2 1004*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym0 1005*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1006*c0909341SAndroid Build Coastguard Worker sub hd, 4 1007*c0909341SAndroid Build Coastguard Worker jg .h_w4 1008*c0909341SAndroid Build Coastguard Worker RET 1009*c0909341SAndroid Build Coastguard Worker.h_w8: 1010*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0+0] 1011*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*0+2] 1012*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [srcq+strideq*1+0], 1 1013*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, 
[srcq+strideq*1+2], 1 1014*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+strideq*2+0], 2 1015*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+strideq*2+2], 2 1016*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+stride3q +0], 3 1017*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+stride3q +2], 3 1018*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1019*c0909341SAndroid Build Coastguard Worker pmullw m0, m4 1020*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 1021*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1022*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1023*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1024*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 1025*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1026*c0909341SAndroid Build Coastguard Worker sub hd, 4 1027*c0909341SAndroid Build Coastguard Worker jg .h_w8 1028*c0909341SAndroid Build Coastguard Worker RET 1029*c0909341SAndroid Build Coastguard Worker.h_w16: 1030*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0+0] 1031*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+strideq*1+0], 1 1032*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+strideq*0+2] 1033*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [srcq+strideq*1+2], 1 1034*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1035*c0909341SAndroid Build Coastguard Worker pmullw m0, m4 1036*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 1037*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1038*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1039*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1040*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 1041*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1042*c0909341SAndroid Build Coastguard Worker sub hd, 2 1043*c0909341SAndroid Build Coastguard Worker jg .h_w16 
1044*c0909341SAndroid Build Coastguard Worker RET 1045*c0909341SAndroid Build Coastguard Worker.h_w32: 1046*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+strideq*0+0] 1047*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+strideq*0+2] 1048*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+strideq*1+0] 1049*c0909341SAndroid Build Coastguard Worker pmullw m3, m5, [srcq+strideq*1+2] 1050*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1051*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1052*c0909341SAndroid Build Coastguard Worker psubw m1, m6 1053*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1054*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1055*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1056*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1057*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1058*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1059*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1060*c0909341SAndroid Build Coastguard Worker sub hd, 2 1061*c0909341SAndroid Build Coastguard Worker jg .h_w32 1062*c0909341SAndroid Build Coastguard Worker RET 1063*c0909341SAndroid Build Coastguard Worker.h_w64: 1064*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ 0] 1065*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+ 2] 1066*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+64] 1067*c0909341SAndroid Build Coastguard Worker pmullw m3, m5, [srcq+66] 1068*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1069*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1070*c0909341SAndroid Build Coastguard Worker psubw m1, m6 1071*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1072*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1073*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1074*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1075*c0909341SAndroid Build Coastguard 
Worker mova [tmpq+64*0], m0 1076*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1077*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1078*c0909341SAndroid Build Coastguard Worker dec hd 1079*c0909341SAndroid Build Coastguard Worker jg .h_w64 1080*c0909341SAndroid Build Coastguard Worker RET 1081*c0909341SAndroid Build Coastguard Worker.h_w128: 1082*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ 0] 1083*c0909341SAndroid Build Coastguard Worker pmullw m7, m5, [srcq+ 2] 1084*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+ 64] 1085*c0909341SAndroid Build Coastguard Worker pmullw m8, m5, [srcq+ 66] 1086*c0909341SAndroid Build Coastguard Worker pmullw m2, m4, [srcq+128] 1087*c0909341SAndroid Build Coastguard Worker pmullw m9, m5, [srcq+130] 1088*c0909341SAndroid Build Coastguard Worker pmullw m3, m4, [srcq+192] 1089*c0909341SAndroid Build Coastguard Worker pmullw m10, m5, [srcq+194] 1090*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1091*c0909341SAndroid Build Coastguard Worker REPX {psubw x, m6}, m0, m1, m2, m3 1092*c0909341SAndroid Build Coastguard Worker paddw m0, m7 1093*c0909341SAndroid Build Coastguard Worker paddw m1, m8 1094*c0909341SAndroid Build Coastguard Worker paddw m2, m9 1095*c0909341SAndroid Build Coastguard Worker paddw m3, m10 1096*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m0, m1, m2, m3 1097*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1098*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1099*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 1100*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 1101*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1102*c0909341SAndroid Build Coastguard Worker dec hd 1103*c0909341SAndroid Build Coastguard Worker jg .h_w128 1104*c0909341SAndroid Build Coastguard Worker RET 1105*c0909341SAndroid Build Coastguard Worker.v: 1106*c0909341SAndroid Build Coastguard Worker movzx wd, 
word [r6+wq*2+table_offset(prep, _bilin_v)] 1107*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, mxyd 1108*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_16] 1109*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pw_32766] 1110*c0909341SAndroid Build Coastguard Worker add wq, r6 1111*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1112*c0909341SAndroid Build Coastguard Worker psubw m8, m9 1113*c0909341SAndroid Build Coastguard Worker test dword r7m, 0x800 1114*c0909341SAndroid Build Coastguard Worker jnz .v_12bpc 1115*c0909341SAndroid Build Coastguard Worker psllw m8, 2 1116*c0909341SAndroid Build Coastguard Worker psllw m9, 2 1117*c0909341SAndroid Build Coastguard Worker.v_12bpc: 1118*c0909341SAndroid Build Coastguard Worker jmp wq 1119*c0909341SAndroid Build Coastguard Worker.v_w4: 1120*c0909341SAndroid Build Coastguard Worker movq xmm0, [srcq+strideq*0] 1121*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1122*c0909341SAndroid Build Coastguard Worker vpbroadcastq xmm2, [srcq+strideq*1] 1123*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm1, [srcq+strideq*2] 1124*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+stride3q ] 1125*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1126*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm1, 0x30 1127*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm3, 0xc0 1128*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 1129*c0909341SAndroid Build Coastguard Worker movq xmm0, [srcq+strideq*0] 1130*c0909341SAndroid Build Coastguard Worker valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 1131*c0909341SAndroid Build Coastguard Worker pmullw ymm1, ym8 1132*c0909341SAndroid Build Coastguard Worker pmullw ymm2, ym9 1133*c0909341SAndroid Build Coastguard Worker psubw ymm1, ym10 1134*c0909341SAndroid Build Coastguard Worker paddw ymm1, ymm2 1135*c0909341SAndroid Build Coastguard Worker psraw ymm1, 
2 1136*c0909341SAndroid Build Coastguard Worker mova [tmpq], ymm1 1137*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1138*c0909341SAndroid Build Coastguard Worker sub hd, 4 1139*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1140*c0909341SAndroid Build Coastguard Worker vzeroupper 1141*c0909341SAndroid Build Coastguard Worker RET 1142*c0909341SAndroid Build Coastguard Worker.v_w8: 1143*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 1144*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1145*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1146*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+strideq*2], 2 1147*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 1148*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1149*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 1150*c0909341SAndroid Build Coastguard Worker valignq m2, m0, m1, 2 ; 1 2 3 4 1151*c0909341SAndroid Build Coastguard Worker pmullw m1, m8 1152*c0909341SAndroid Build Coastguard Worker pmullw m2, m9 1153*c0909341SAndroid Build Coastguard Worker psubw m1, m10 1154*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1155*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1156*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1157*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1158*c0909341SAndroid Build Coastguard Worker sub hd, 4 1159*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1160*c0909341SAndroid Build Coastguard Worker RET 1161*c0909341SAndroid Build Coastguard Worker.v_w16: 1162*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 1163*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1164*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 1165*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+strideq*2] 1166*c0909341SAndroid Build Coastguard Worker 
vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 1167*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1168*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 1169*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m3, q1032 ; 1 2 1170*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m2, m0, q1032 ; 3 4 1171*c0909341SAndroid Build Coastguard Worker pmullw m1, m8 1172*c0909341SAndroid Build Coastguard Worker pmullw m2, m8 1173*c0909341SAndroid Build Coastguard Worker pmullw m3, m9 1174*c0909341SAndroid Build Coastguard Worker pmullw m4, m9 1175*c0909341SAndroid Build Coastguard Worker psubw m1, m10 1176*c0909341SAndroid Build Coastguard Worker psubw m2, m10 1177*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1178*c0909341SAndroid Build Coastguard Worker paddw m2, m4 1179*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1180*c0909341SAndroid Build Coastguard Worker psraw m2, 2 1181*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1182*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1183*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1184*c0909341SAndroid Build Coastguard Worker sub hd, 4 1185*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1186*c0909341SAndroid Build Coastguard Worker RET 1187*c0909341SAndroid Build Coastguard Worker.v_w32: 1188*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1189*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1190*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+strideq*1] 1191*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1192*c0909341SAndroid Build Coastguard Worker pmullw m1, m8, m0 1193*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+strideq*0] 1194*c0909341SAndroid Build Coastguard Worker pmullw m2, m8, m3 1195*c0909341SAndroid Build Coastguard Worker pmullw m3, m9 1196*c0909341SAndroid Build Coastguard Worker pmullw m4, m9, m0 1197*c0909341SAndroid Build 
Coastguard Worker psubw m1, m10 1198*c0909341SAndroid Build Coastguard Worker psubw m2, m10 1199*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1200*c0909341SAndroid Build Coastguard Worker paddw m2, m4 1201*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1202*c0909341SAndroid Build Coastguard Worker psraw m2, 2 1203*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1204*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1205*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1206*c0909341SAndroid Build Coastguard Worker sub hd, 2 1207*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1208*c0909341SAndroid Build Coastguard Worker RET 1209*c0909341SAndroid Build Coastguard Worker.v_w64: 1210*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+64*0] 1211*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+64*1] 1212*c0909341SAndroid Build Coastguard Worker.v_w64_loop: 1213*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1214*c0909341SAndroid Build Coastguard Worker pmullw m2, m8, m0 1215*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+64*0] 1216*c0909341SAndroid Build Coastguard Worker pmullw m3, m8, m1 1217*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+64*1] 1218*c0909341SAndroid Build Coastguard Worker pmullw m4, m9, m0 1219*c0909341SAndroid Build Coastguard Worker pmullw m5, m9, m1 1220*c0909341SAndroid Build Coastguard Worker psubw m2, m10 1221*c0909341SAndroid Build Coastguard Worker psubw m3, m10 1222*c0909341SAndroid Build Coastguard Worker paddw m2, m4 1223*c0909341SAndroid Build Coastguard Worker paddw m3, m5 1224*c0909341SAndroid Build Coastguard Worker psraw m2, 2 1225*c0909341SAndroid Build Coastguard Worker psraw m3, 2 1226*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m2 1227*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m3 1228*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1229*c0909341SAndroid Build Coastguard Worker dec hd 
1230*c0909341SAndroid Build Coastguard Worker jg .v_w64_loop 1231*c0909341SAndroid Build Coastguard Worker RET 1232*c0909341SAndroid Build Coastguard Worker.v_w128: 1233*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+64*0] 1234*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+64*1] 1235*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+64*2] 1236*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+64*3] 1237*c0909341SAndroid Build Coastguard Worker.v_w128_loop: 1238*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1239*c0909341SAndroid Build Coastguard Worker pmullw m4, m8, m0 1240*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+64*0] 1241*c0909341SAndroid Build Coastguard Worker pmullw m5, m8, m1 1242*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+64*1] 1243*c0909341SAndroid Build Coastguard Worker pmullw m6, m8, m2 1244*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+64*2] 1245*c0909341SAndroid Build Coastguard Worker pmullw m7, m8, m3 1246*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+64*3] 1247*c0909341SAndroid Build Coastguard Worker pmullw m11, m9, m0 1248*c0909341SAndroid Build Coastguard Worker pmullw m12, m9, m1 1249*c0909341SAndroid Build Coastguard Worker pmullw m13, m9, m2 1250*c0909341SAndroid Build Coastguard Worker pmullw m14, m9, m3 1251*c0909341SAndroid Build Coastguard Worker REPX {psubw x, m10}, m4, m5, m6, m7 1252*c0909341SAndroid Build Coastguard Worker paddw m4, m11 1253*c0909341SAndroid Build Coastguard Worker paddw m5, m12 1254*c0909341SAndroid Build Coastguard Worker paddw m6, m13 1255*c0909341SAndroid Build Coastguard Worker paddw m7, m14 1256*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m4, m5, m6, m7 1257*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m4 1258*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m5 1259*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m6 1260*c0909341SAndroid Build Coastguard Worker mova 
[tmpq+64*3], m7 1261*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1262*c0909341SAndroid Build Coastguard Worker dec hd 1263*c0909341SAndroid Build Coastguard Worker jg .v_w128_loop 1264*c0909341SAndroid Build Coastguard Worker RET 1265*c0909341SAndroid Build Coastguard Worker.hv: 1266*c0909341SAndroid Build Coastguard Worker movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1267*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 1268*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, mxyd 1269*c0909341SAndroid Build Coastguard Worker add wq, r6 1270*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1271*c0909341SAndroid Build Coastguard Worker jmp wq 1272*c0909341SAndroid Build Coastguard Worker.hv_w4: 1273*c0909341SAndroid Build Coastguard Worker movq xmm0, [srcq+strideq*0+0] 1274*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+strideq*0+2] 1275*c0909341SAndroid Build Coastguard Worker pmullw xmm0, xm4 1276*c0909341SAndroid Build Coastguard Worker pmullw xmm1, xm5 1277*c0909341SAndroid Build Coastguard Worker psubw xmm0, xm6 1278*c0909341SAndroid Build Coastguard Worker paddw xmm0, xmm1 1279*c0909341SAndroid Build Coastguard Worker psraw xmm0, 2 1280*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym0, xmm0 1281*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1282*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1] 1283*c0909341SAndroid Build Coastguard Worker vinserti128 ym1, [srcq+stride3q ], 1 1284*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*2] 1285*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1286*c0909341SAndroid Build Coastguard Worker vinserti128 ym2, [srcq+strideq*0], 1 1287*c0909341SAndroid Build Coastguard Worker punpcklqdq ym3, ym1, ym2 1288*c0909341SAndroid Build Coastguard Worker psrldq ym1, 2 1289*c0909341SAndroid Build Coastguard Worker psrldq ym2, 2 1290*c0909341SAndroid Build Coastguard Worker pmullw ym3, ym4 
1291*c0909341SAndroid Build Coastguard Worker punpcklqdq ym1, ym2 1292*c0909341SAndroid Build Coastguard Worker pmullw ym1, ym5 1293*c0909341SAndroid Build Coastguard Worker psubw ym3, ym6 1294*c0909341SAndroid Build Coastguard Worker paddw ym1, ym3 1295*c0909341SAndroid Build Coastguard Worker psraw ym1, 2 ; 1 2 3 4 1296*c0909341SAndroid Build Coastguard Worker valignq ym2, ym1, ym0, 3 ; 0 1 2 3 1297*c0909341SAndroid Build Coastguard Worker mova ym0, ym1 1298*c0909341SAndroid Build Coastguard Worker psubw ym1, ym2 1299*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym7 1300*c0909341SAndroid Build Coastguard Worker paddw ym1, ym2 1301*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym1 1302*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1303*c0909341SAndroid Build Coastguard Worker sub hd, 4 1304*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1305*c0909341SAndroid Build Coastguard Worker RET 1306*c0909341SAndroid Build Coastguard Worker.hv_w8: 1307*c0909341SAndroid Build Coastguard Worker pmullw xm0, xm4, [srcq+strideq*0+0] 1308*c0909341SAndroid Build Coastguard Worker pmullw xm1, xm5, [srcq+strideq*0+2] 1309*c0909341SAndroid Build Coastguard Worker psubw xm0, xm6 1310*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 1311*c0909341SAndroid Build Coastguard Worker psraw xm0, 2 1312*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, xm0, 3 1313*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 1314*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+strideq*1+0] 1315*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*1+2] 1316*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, [srcq+strideq*2+0], 1 1317*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+strideq*2+2], 1 1318*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+stride3q +0], 2 1319*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+stride3q +2], 2 1320*c0909341SAndroid Build Coastguard Worker 
lea srcq, [srcq+strideq*4] 1321*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+strideq*0+0], 3 1322*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+strideq*0+2], 3 1323*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 1324*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 1325*c0909341SAndroid Build Coastguard Worker psubw m1, m6 1326*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1327*c0909341SAndroid Build Coastguard Worker psraw m1, 2 ; 1 2 3 4 1328*c0909341SAndroid Build Coastguard Worker valignq m2, m1, m0, 6 ; 0 1 2 3 1329*c0909341SAndroid Build Coastguard Worker mova m0, m1 1330*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1331*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 1332*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1333*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1334*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1335*c0909341SAndroid Build Coastguard Worker sub hd, 4 1336*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 1337*c0909341SAndroid Build Coastguard Worker RET 1338*c0909341SAndroid Build Coastguard Worker.hv_w16: 1339*c0909341SAndroid Build Coastguard Worker pmullw ym0, ym4, [srcq+strideq*0+0] 1340*c0909341SAndroid Build Coastguard Worker pmullw ym1, ym5, [srcq+strideq*0+2] 1341*c0909341SAndroid Build Coastguard Worker psubw ym0, ym6 1342*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 1343*c0909341SAndroid Build Coastguard Worker psraw ym0, 2 1344*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym0, 1 1345*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 1346*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+strideq*1+0] 1347*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+strideq*1+2] 1348*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1349*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [srcq+strideq*0+0], 1 1350*c0909341SAndroid Build Coastguard Worker 
vinserti32x8 m2, [srcq+strideq*0+2], 1 1351*c0909341SAndroid Build Coastguard Worker pmullw m1, m4 1352*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 1353*c0909341SAndroid Build Coastguard Worker psubw m1, m6 1354*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1355*c0909341SAndroid Build Coastguard Worker psraw m1, 2 ; 1 2 1356*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m1, q1032 ; 0 1 1357*c0909341SAndroid Build Coastguard Worker mova m0, m1 1358*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1359*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 1360*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1361*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1362*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1363*c0909341SAndroid Build Coastguard Worker sub hd, 2 1364*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 1365*c0909341SAndroid Build Coastguard Worker RET 1366*c0909341SAndroid Build Coastguard Worker.hv_w32: 1367*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+strideq*0+0] 1368*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, [srcq+strideq*0+2] 1369*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1370*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1371*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1372*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 1373*c0909341SAndroid Build Coastguard Worker pmullw m3, m4, [srcq+strideq*1+0] 1374*c0909341SAndroid Build Coastguard Worker pmullw m1, m5, [srcq+strideq*1+2] 1375*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1376*c0909341SAndroid Build Coastguard Worker psubw m3, m6 1377*c0909341SAndroid Build Coastguard Worker paddw m3, m1 1378*c0909341SAndroid Build Coastguard Worker psraw m3, 2 1379*c0909341SAndroid Build Coastguard Worker psubw m1, m3, m0 1380*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 1381*c0909341SAndroid Build Coastguard Worker paddw m1, m0 
1382*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+strideq*0+0] 1383*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+strideq*0+2] 1384*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1385*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1386*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1387*c0909341SAndroid Build Coastguard Worker psubw m2, m0, m3 1388*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m7 1389*c0909341SAndroid Build Coastguard Worker paddw m2, m3 1390*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1391*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1392*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1393*c0909341SAndroid Build Coastguard Worker sub hd, 2 1394*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 1395*c0909341SAndroid Build Coastguard Worker RET 1396*c0909341SAndroid Build Coastguard Worker.hv_w64: 1397*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ 0] 1398*c0909341SAndroid Build Coastguard Worker pmullw m2, m5, [srcq+ 2] 1399*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+64] 1400*c0909341SAndroid Build Coastguard Worker pmullw m3, m5, [srcq+66] 1401*c0909341SAndroid Build Coastguard Worker psubw m0, m6 1402*c0909341SAndroid Build Coastguard Worker psubw m1, m6 1403*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1404*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1405*c0909341SAndroid Build Coastguard Worker psraw m0, 2 1406*c0909341SAndroid Build Coastguard Worker psraw m1, 2 1407*c0909341SAndroid Build Coastguard Worker.hv_w64_loop: 1408*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1409*c0909341SAndroid Build Coastguard Worker pmullw m2, m4, [srcq+ 0] 1410*c0909341SAndroid Build Coastguard Worker pmullw m8, m5, [srcq+ 2] 1411*c0909341SAndroid Build Coastguard Worker pmullw m3, m4, [srcq+64] 1412*c0909341SAndroid Build Coastguard Worker pmullw m9, m5, [srcq+66] 
1413*c0909341SAndroid Build Coastguard Worker psubw m2, m6 1414*c0909341SAndroid Build Coastguard Worker psubw m3, m6 1415*c0909341SAndroid Build Coastguard Worker paddw m2, m8 1416*c0909341SAndroid Build Coastguard Worker paddw m3, m9 1417*c0909341SAndroid Build Coastguard Worker psraw m2, 2 1418*c0909341SAndroid Build Coastguard Worker psraw m3, 2 1419*c0909341SAndroid Build Coastguard Worker psubw m8, m2, m0 1420*c0909341SAndroid Build Coastguard Worker psubw m9, m3, m1 1421*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m7 1422*c0909341SAndroid Build Coastguard Worker pmulhrsw m9, m7 1423*c0909341SAndroid Build Coastguard Worker paddw m8, m0 1424*c0909341SAndroid Build Coastguard Worker mova m0, m2 1425*c0909341SAndroid Build Coastguard Worker paddw m9, m1 1426*c0909341SAndroid Build Coastguard Worker mova m1, m3 1427*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m8 1428*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m9 1429*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1430*c0909341SAndroid Build Coastguard Worker dec hd 1431*c0909341SAndroid Build Coastguard Worker jg .hv_w64_loop 1432*c0909341SAndroid Build Coastguard Worker RET 1433*c0909341SAndroid Build Coastguard Worker.hv_w128: 1434*c0909341SAndroid Build Coastguard Worker pmullw m0, m4, [srcq+ 0] 1435*c0909341SAndroid Build Coastguard Worker pmullw m8, m5, [srcq+ 2] 1436*c0909341SAndroid Build Coastguard Worker pmullw m1, m4, [srcq+ 64] 1437*c0909341SAndroid Build Coastguard Worker pmullw m9, m5, [srcq+ 66] 1438*c0909341SAndroid Build Coastguard Worker pmullw m2, m4, [srcq+128] 1439*c0909341SAndroid Build Coastguard Worker pmullw m10, m5, [srcq+130] 1440*c0909341SAndroid Build Coastguard Worker pmullw m3, m4, [srcq+192] 1441*c0909341SAndroid Build Coastguard Worker pmullw m11, m5, [srcq+194] 1442*c0909341SAndroid Build Coastguard Worker REPX {psubw x, m6}, m0, m1, m2, m3 1443*c0909341SAndroid Build Coastguard Worker paddw m0, m8 1444*c0909341SAndroid Build 
Coastguard Worker paddw m1, m9 1445*c0909341SAndroid Build Coastguard Worker paddw m2, m10 1446*c0909341SAndroid Build Coastguard Worker paddw m3, m11 1447*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m0, m1, m2, m3 1448*c0909341SAndroid Build Coastguard Worker.hv_w128_loop: 1449*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1450*c0909341SAndroid Build Coastguard Worker pmullw m8, m4, [srcq+ 0] 1451*c0909341SAndroid Build Coastguard Worker pmullw m12, m5, [srcq+ 2] 1452*c0909341SAndroid Build Coastguard Worker pmullw m9, m4, [srcq+ 64] 1453*c0909341SAndroid Build Coastguard Worker pmullw m13, m5, [srcq+ 66] 1454*c0909341SAndroid Build Coastguard Worker pmullw m10, m4, [srcq+128] 1455*c0909341SAndroid Build Coastguard Worker pmullw m14, m5, [srcq+130] 1456*c0909341SAndroid Build Coastguard Worker pmullw m11, m4, [srcq+192] 1457*c0909341SAndroid Build Coastguard Worker pmullw m15, m5, [srcq+194] 1458*c0909341SAndroid Build Coastguard Worker REPX {psubw x, m6}, m8, m9, m10, m11 1459*c0909341SAndroid Build Coastguard Worker paddw m8, m12 1460*c0909341SAndroid Build Coastguard Worker paddw m9, m13 1461*c0909341SAndroid Build Coastguard Worker paddw m10, m14 1462*c0909341SAndroid Build Coastguard Worker paddw m11, m15 1463*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m8, m9, m10, m11 1464*c0909341SAndroid Build Coastguard Worker psubw m12, m8, m0 1465*c0909341SAndroid Build Coastguard Worker psubw m13, m9, m1 1466*c0909341SAndroid Build Coastguard Worker psubw m14, m10, m2 1467*c0909341SAndroid Build Coastguard Worker psubw m15, m11, m3 1468*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m7}, m12, m13, m14, m15 1469*c0909341SAndroid Build Coastguard Worker paddw m12, m0 1470*c0909341SAndroid Build Coastguard Worker mova m0, m8 1471*c0909341SAndroid Build Coastguard Worker paddw m13, m1 1472*c0909341SAndroid Build Coastguard Worker mova m1, m9 1473*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m12 
; (Tail of the preceding .hv_w128_loop -- instructions reproduced unchanged.)
    mova        [tmpq+64*1], m13
    paddw               m14, m2
    mova                 m2, m10
    paddw               m15, m3
    mova                 m3, m11
    mova        [tmpq+64*2], m14
    mova        [tmpq+64*3], m15
    add                tmpq, 64*4
    dec                  hd
    jg .hv_w128_loop
    RET

; int8_t subpel_filters[5][15][8]
;
; Every FILTER_* value packs two offsets, each a multiple of 15, into one
; dword. put_6tap_16bpc adds the dword to mx*0x010101 (resp. my*0x010101),
; so bits 16+ end up in the byte its comments label "6tap" and bits 0-7 in
; the byte labelled "4tap".
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN prefix, type, type_h, type_v [, jmp_to]
;
; Emits the entry point <prefix>_<type>_16bpc, which only loads the
; horizontal filter constant into t0d and the vertical one into t1d.
; With a fifth argument the stub then jumps to the named shared body.
; Without it no jump is emitted, so the stub runs straight into whatever
; is assembled immediately after the macro invocation.
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifnidn %3, %4
    mov                 t1d, FILTER_%4
%else
    mov                 t1d, t0d ; same filter both ways: copy the register
%endif
%if %0 > 4 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; Pick the two temporaries (t0/t1) used by the FN stubs and the address of
; the small scratch buffer ("buf") that the filter code spills coefficients
; to, depending on the target ABI.
%if WIN64 == 0
DECLARE_REG_TMP 7, 8
; red zone
%define buf rsp-40
%else
DECLARE_REG_TMP 4, 5
; shadow space
%define buf rsp+stack_offset+8
%endif

; put_8tap entry points that share the 6-tap body below. The final one
; ("regular") passes no jump target and must therefore stay directly in
; front of put_6tap_16bpc.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

; (Start of put_6tap_16bpc -- instructions reproduced unchanged.)
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movifnidn            wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
.put:
    tzcnt                wd, wd
1533*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put,)] 1534*c0909341SAndroid Build Coastguard Worker add wq, r8 1535*c0909341SAndroid Build Coastguard Worker%if WIN64 1536*c0909341SAndroid Build Coastguard Worker pop r8 1537*c0909341SAndroid Build Coastguard Worker%endif 1538*c0909341SAndroid Build Coastguard Worker jmp wq 1539*c0909341SAndroid Build Coastguard Worker.h_w8: 1540*c0909341SAndroid Build Coastguard Worker mova m4, [spel_h_shufA] 1541*c0909341SAndroid Build Coastguard Worker movu m5, [spel_h_shufB] 1542*c0909341SAndroid Build Coastguard Worker movu m6, [spel_h_shufC] 1543*c0909341SAndroid Build Coastguard Worker.h_w8_loop: 1544*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+ssq*0] 1545*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+ssq*1], 1 1546*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1547*c0909341SAndroid Build Coastguard Worker mova m0, m8 1548*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, m2 1549*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m1 1550*c0909341SAndroid Build Coastguard Worker vpermb m1, m5, m2 1551*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m1 1552*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m2 1553*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m1 1554*c0909341SAndroid Build Coastguard Worker psrad m0, 6 1555*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym1, m0, 1 1556*c0909341SAndroid Build Coastguard Worker packusdw ym0, ym1 1557*c0909341SAndroid Build Coastguard Worker pminsw ym0, ym15 1558*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 1559*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 1560*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1561*c0909341SAndroid Build Coastguard Worker sub hd, 2 1562*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 1563*c0909341SAndroid Build Coastguard Worker RET 
1564*c0909341SAndroid Build Coastguard Worker.h: 1565*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, r8m 1566*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1567*c0909341SAndroid Build Coastguard Worker jnz .hv 1568*c0909341SAndroid Build Coastguard Worker mov r7d, r8m 1569*c0909341SAndroid Build Coastguard Worker shr r7d, 11 1570*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] 1571*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1572*c0909341SAndroid Build Coastguard Worker jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4 1573*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1574*c0909341SAndroid Build Coastguard Worker sub srcq, 4 1575*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 1576*c0909341SAndroid Build Coastguard Worker mova [buf], xmm0 1577*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, xmm0 1578*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [buf+8] 1579*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [buf+4] 1580*c0909341SAndroid Build Coastguard Worker sub wd, 16 1581*c0909341SAndroid Build Coastguard Worker jl .h_w8 1582*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufA] 1583*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [spel_h_shufB] 1584*c0909341SAndroid Build Coastguard Worker jg .h_w32 1585*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 1586*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+ssq*0+ 0] 1587*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+ssq*1+ 0], 1 1588*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*0+12] 1589*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*1+12], 1 1590*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1591*c0909341SAndroid Build Coastguard Worker mova m0, m8 1592*c0909341SAndroid Build Coastguard Worker mova m1, m8 1593*c0909341SAndroid Build 
Coastguard Worker pshufb m4, m2, m6 1594*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m4 ; a0 b0 1595*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m7 1596*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; a2' b2' 1597*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 1598*c0909341SAndroid Build Coastguard Worker pshufb m3, m6 1599*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m2 ; a1 b1 1600*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m3 ; a1' b1' 1601*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 1602*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m2 ; a2 b2 1603*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m10, m2 ; a0' b0' 1604*c0909341SAndroid Build Coastguard Worker psrad m0, 6 1605*c0909341SAndroid Build Coastguard Worker psrad m1, 6 1606*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 1607*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 1608*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 1609*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 1610*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1611*c0909341SAndroid Build Coastguard Worker sub hd, 2 1612*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 1613*c0909341SAndroid Build Coastguard Worker RET 1614*c0909341SAndroid Build Coastguard Worker.h_w32: 1615*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 1616*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 1617*c0909341SAndroid Build Coastguard Worker neg wq 1618*c0909341SAndroid Build Coastguard Worker.h_w32_loop0: 1619*c0909341SAndroid Build Coastguard Worker mov r6, wq 1620*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 1621*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+ 0] 1622*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+r6*2+12] 1623*c0909341SAndroid Build Coastguard Worker mova m0, m8 
1624*c0909341SAndroid Build Coastguard Worker mova m1, m8 1625*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 1626*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m4 ; a0 1627*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m7 1628*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; b2 1629*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 1630*c0909341SAndroid Build Coastguard Worker pshufb m3, m6 1631*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m2 ; a1 1632*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m3 ; b1 1633*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 1634*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m2 ; a2 1635*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m10, m2 ; b0 1636*c0909341SAndroid Build Coastguard Worker psrad m0, 6 1637*c0909341SAndroid Build Coastguard Worker psrad m1, 6 1638*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 1639*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 1640*c0909341SAndroid Build Coastguard Worker mova [dstq+r6*2], m0 1641*c0909341SAndroid Build Coastguard Worker add r6, 32 1642*c0909341SAndroid Build Coastguard Worker jl .h_w32_loop 1643*c0909341SAndroid Build Coastguard Worker add srcq, ssq 1644*c0909341SAndroid Build Coastguard Worker add dstq, dsq 1645*c0909341SAndroid Build Coastguard Worker dec hd 1646*c0909341SAndroid Build Coastguard Worker jg .h_w32_loop0 1647*c0909341SAndroid Build Coastguard Worker RET 1648*c0909341SAndroid Build Coastguard Worker.v: 1649*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1650*c0909341SAndroid Build Coastguard Worker shr myd, 16 1651*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1652*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1653*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [pd_32] 1654*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+myq*8] 1655*c0909341SAndroid Build Coastguard 
Worker tzcnt r7d, wd 1656*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, r8m 1657*c0909341SAndroid Build Coastguard Worker mov r6, ssq 1658*c0909341SAndroid Build Coastguard Worker movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)] 1659*c0909341SAndroid Build Coastguard Worker neg r6 1660*c0909341SAndroid Build Coastguard Worker mova [rsp+stack_offset+8], xmm0 1661*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 1662*c0909341SAndroid Build Coastguard Worker add r7, r8 1663*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [rsp+stack_offset+12] 1664*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [rsp+stack_offset+16] 1665*c0909341SAndroid Build Coastguard Worker jmp r7 1666*c0909341SAndroid Build Coastguard Worker.v_w2: 1667*c0909341SAndroid Build Coastguard Worker movd xmm2, [srcq+r6 *2] 1668*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+r6 *1], 1 1669*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*0], 2 1670*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 1671*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1672*c0909341SAndroid Build Coastguard Worker movd xmm0, [srcq+ssq*0] 1673*c0909341SAndroid Build Coastguard Worker palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 1674*c0909341SAndroid Build Coastguard Worker punpcklwd xmm1, xmm2, xmm3 ; 01 12 1675*c0909341SAndroid Build Coastguard Worker punpckhwd xmm2, xmm3 ; 23 34 1676*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 1677*c0909341SAndroid Build Coastguard Worker movd xmm3, [srcq+ssq*1] 1678*c0909341SAndroid Build Coastguard Worker mova xmm4, xm11 1679*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm4, xmm1, xm12 ; a0 b0 1680*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1681*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 1682*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm4, xmm2, xm13 ; a1 b1 1683*c0909341SAndroid Build Coastguard Worker punpckldq 
xmm2, xmm0, xmm3 ; 4 5 1684*c0909341SAndroid Build Coastguard Worker movd xmm0, [srcq+ssq*0] 1685*c0909341SAndroid Build Coastguard Worker punpckldq xmm3, xmm0 ; 5 6 1686*c0909341SAndroid Build Coastguard Worker punpcklwd xmm2, xmm3 ; 45 56 1687*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm4, xmm2, xm14 ; a2 b2 1688*c0909341SAndroid Build Coastguard Worker psrad xmm4, 6 1689*c0909341SAndroid Build Coastguard Worker packusdw xmm4, xmm4 1690*c0909341SAndroid Build Coastguard Worker pminsw xmm4, xm15 1691*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm4 1692*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm4, 1 1693*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1694*c0909341SAndroid Build Coastguard Worker sub hd, 2 1695*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 1696*c0909341SAndroid Build Coastguard Worker RET 1697*c0909341SAndroid Build Coastguard Worker.v_w4: 1698*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+r6 *2] 1699*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+r6 *1] 1700*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm2, [srcq+ssq*0] 1701*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm4, [srcq+ssq*1] 1702*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1703*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 1704*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm3, 0x30 1705*c0909341SAndroid Build Coastguard Worker vpblendd ymm3, ymm2, 0x30 1706*c0909341SAndroid Build Coastguard Worker punpcklwd ymm1, ymm3 ; 01 12 1707*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm4, 0x30 1708*c0909341SAndroid Build Coastguard Worker vpblendd ymm4, ymm0, 0x30 1709*c0909341SAndroid Build Coastguard Worker punpcklwd ymm2, ymm4 ; 23 34 1710*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1711*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+ssq*1] 1712*c0909341SAndroid Build 
Coastguard Worker mova ymm4, ym11 1713*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ymm1, ym12 ; a0 b0 1714*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1715*c0909341SAndroid Build Coastguard Worker mova ymm1, ymm2 1716*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ymm2, ym13 ; a1 b1 1717*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm0, ymm3, 0x30 1718*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 1719*c0909341SAndroid Build Coastguard Worker vpblendd ymm3, ymm0, 0x30 1720*c0909341SAndroid Build Coastguard Worker punpcklwd ymm2, ymm3 ; 45 56 1721*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ymm2, ym14 ; a2 b2 1722*c0909341SAndroid Build Coastguard Worker psrad ymm4, 6 1723*c0909341SAndroid Build Coastguard Worker vextracti128 xmm3, ymm4, 1 1724*c0909341SAndroid Build Coastguard Worker packusdw xmm4, xmm3 1725*c0909341SAndroid Build Coastguard Worker pminsw xmm4, xm15 1726*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm4 1727*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm4 1728*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1729*c0909341SAndroid Build Coastguard Worker sub hd, 2 1730*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1731*c0909341SAndroid Build Coastguard Worker vzeroupper 1732*c0909341SAndroid Build Coastguard Worker RET 1733*c0909341SAndroid Build Coastguard Worker.v_w8: 1734*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+ssq*0] 1735*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, m0, [srcq+r6 *2], 0 1736*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2 1737*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [srcq+ssq*1], 1 1738*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1739*c0909341SAndroid Build Coastguard Worker mova m5, [spel_v_shuf8] 1740*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, 
[srcq+ssq*0], 2 ; 2 3 4 1741*c0909341SAndroid Build Coastguard Worker vpermb m1, m5, m1 ; 01 12 1742*c0909341SAndroid Build Coastguard Worker vpermb m2, m5, m0 ; 23 34 1743*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1744*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+ssq*1], 3 1745*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1746*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*0] 1747*c0909341SAndroid Build Coastguard Worker mova m4, m11 1748*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m1 ; a0 b0 1749*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m3, q1032 ; 4 5 6 1750*c0909341SAndroid Build Coastguard Worker mova m1, m2 1751*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 b1 1752*c0909341SAndroid Build Coastguard Worker vpermb m2, m5, m0 ; 45 56 1753*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m2 ; a2 b2 1754*c0909341SAndroid Build Coastguard Worker psrad m4, 6 1755*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym3, m4, 1 1756*c0909341SAndroid Build Coastguard Worker packusdw ym4, ym3 1757*c0909341SAndroid Build Coastguard Worker pminsw ym4, ym15 1758*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm4 1759*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym4, 1 1760*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1761*c0909341SAndroid Build Coastguard Worker sub hd, 2 1762*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1763*c0909341SAndroid Build Coastguard Worker RET 1764*c0909341SAndroid Build Coastguard Worker.v_w16: 1765*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m0, [srcq+r6 *1] 1766*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, [srcq+ssq*0], 1 1767*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+r6*2], 0 1768*c0909341SAndroid Build Coastguard Worker mova m6, [spel_v_shuf16] 1769*c0909341SAndroid Build Coastguard Worker movu ym3, 
[srcq+ssq*1] 1770*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1771*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*0], 1 1772*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m1 ; 12 1773*c0909341SAndroid Build Coastguard Worker vpermb m0, m6, m0 ; 01 1774*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m3 ; 34 1775*c0909341SAndroid Build Coastguard Worker mova m7, [deint_q_shuf] 1776*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 23 1777*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1778*c0909341SAndroid Build Coastguard Worker mova m5, m11 1779*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m1 ; b0 1780*c0909341SAndroid Build Coastguard Worker mova m4, m11 1781*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m0 ; a0 1782*c0909341SAndroid Build Coastguard Worker mova m1, m3 1783*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m3 ; b1 1784*c0909341SAndroid Build Coastguard Worker mova m0, m2 1785*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 1786*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*1] 1787*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1788*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*0], 1 1789*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m3 ; 56 1790*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 45 1791*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m3 ; b2 1792*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m2 ; a2 1793*c0909341SAndroid Build Coastguard Worker psrad m5, 6 1794*c0909341SAndroid Build Coastguard Worker psrad m4, 6 1795*c0909341SAndroid Build Coastguard Worker packusdw m4, m5 1796*c0909341SAndroid Build Coastguard Worker pminsw m4, m15 1797*c0909341SAndroid Build Coastguard Worker vpermq m4, m7, m4 1798*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym4 1799*c0909341SAndroid Build 
Coastguard Worker vextracti32x8 [dstq+dsq*1], m4, 1 1800*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1801*c0909341SAndroid Build Coastguard Worker sub hd, 2 1802*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1803*c0909341SAndroid Build Coastguard Worker RET 1804*c0909341SAndroid Build Coastguard Worker.v_w32: 1805*c0909341SAndroid Build Coastguard Worker.v_w64: 1806*c0909341SAndroid Build Coastguard Worker.v_w128: 1807*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq*8-256] 1808*c0909341SAndroid Build Coastguard Worker.v_w32_loop0: 1809*c0909341SAndroid Build Coastguard Worker movu m16, [srcq+r6 *2] 1810*c0909341SAndroid Build Coastguard Worker movu m17, [srcq+r6 *1] 1811*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*2] 1812*c0909341SAndroid Build Coastguard Worker movu m18, [srcq+ssq*0] 1813*c0909341SAndroid Build Coastguard Worker movu m19, [srcq+ssq*1] 1814*c0909341SAndroid Build Coastguard Worker mov r8, dstq 1815*c0909341SAndroid Build Coastguard Worker movu m20, [r7 +ssq*0] 1816*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m16, m17 ; 01 1817*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 1818*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m17, m18 ; 12 1819*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m18 1820*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m18, m19 ; 23 1821*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19 1822*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m19, m20 ; 34 1823*c0909341SAndroid Build Coastguard Worker punpckhwd m19, m20 1824*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1825*c0909341SAndroid Build Coastguard Worker mova m4, m11 1826*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m0 ; a0 1827*c0909341SAndroid Build Coastguard Worker mova m6, m11 1828*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m16 1829*c0909341SAndroid Build Coastguard Worker mova m5, m11 1830*c0909341SAndroid 
Build Coastguard Worker vpdpwssd m5, m12, m1 ; b0 1831*c0909341SAndroid Build Coastguard Worker mova m7, m11 1832*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m17 1833*c0909341SAndroid Build Coastguard Worker mova m0, m2 1834*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 1835*c0909341SAndroid Build Coastguard Worker mova m16, m18 1836*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m18 1837*c0909341SAndroid Build Coastguard Worker mova m1, m3 1838*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m3 ; b1 1839*c0909341SAndroid Build Coastguard Worker mova m17, m19 1840*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m19 1841*c0909341SAndroid Build Coastguard Worker movu m19, [r7+ssq*1] 1842*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 1843*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m20, m19 ; 45 1844*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m20, m19 1845*c0909341SAndroid Build Coastguard Worker movu m20, [r7+ssq*0] 1846*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m2 ; a2 1847*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m18 1848*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m19, m20 ; 56 1849*c0909341SAndroid Build Coastguard Worker punpckhwd m19, m20 1850*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m3 ; b2 1851*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m19 1852*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m4, m6, m5, m7 1853*c0909341SAndroid Build Coastguard Worker packusdw m4, m6 1854*c0909341SAndroid Build Coastguard Worker packusdw m5, m7 1855*c0909341SAndroid Build Coastguard Worker pminsw m4, m15 1856*c0909341SAndroid Build Coastguard Worker pminsw m5, m15 1857*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], m4 1858*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*1], m5 1859*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 1860*c0909341SAndroid 
Build Coastguard Worker sub hd, 2 1861*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1862*c0909341SAndroid Build Coastguard Worker add srcq, 64 1863*c0909341SAndroid Build Coastguard Worker add dstq, 64 1864*c0909341SAndroid Build Coastguard Worker movzx hd, wb 1865*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 1866*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop0 1867*c0909341SAndroid Build Coastguard Worker vzeroupper 1868*c0909341SAndroid Build Coastguard Worker RET 1869*c0909341SAndroid Build Coastguard Worker.hv: 1870*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1871*c0909341SAndroid Build Coastguard Worker jg .hv_w8 1872*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1873*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 1874*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1875*c0909341SAndroid Build Coastguard Worker shr myd, 16 1876*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1877*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1878*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 1879*c0909341SAndroid Build Coastguard Worker mov r6, ssq 1880*c0909341SAndroid Build Coastguard Worker sub srcq, 2 1881*c0909341SAndroid Build Coastguard Worker neg r6 1882*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 1883*c0909341SAndroid Build Coastguard Worker jnz .hv_12bit 1884*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_2176] 1885*c0909341SAndroid Build Coastguard Worker psllw xmm0, 6 1886*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1887*c0909341SAndroid Build Coastguard Worker.hv_12bit: 1888*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_640] 1889*c0909341SAndroid Build Coastguard Worker psllw xmm0, 4 1890*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 1891*c0909341SAndroid Build Coastguard Worker.hv_main: 1892*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 
*2] 1893*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym4, [srcq+r6 *1], 1 1894*c0909341SAndroid Build Coastguard Worker vinserti32x4 m4, [srcq+ssq*0], 2 1895*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufA] 1896*c0909341SAndroid Build Coastguard Worker vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3 1897*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1898*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*0] ; 4 1899*c0909341SAndroid Build Coastguard Worker mova [buf+ 0], xmm0 1900*c0909341SAndroid Build Coastguard Worker mova [buf+16], xmm1 1901*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [buf+ 4] 1902*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [buf+ 8] 1903*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, xmm1 1904*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym13, [buf+20] 1905*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym14, [buf+24] 1906*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1907*c0909341SAndroid Build Coastguard Worker je .hv_w4 1908*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [spel_h_shufA] 1909*c0909341SAndroid Build Coastguard Worker mova m3, [spel_h_shuf2b] 1910*c0909341SAndroid Build Coastguard Worker mova m1, m10 1911*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 1912*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm6 1913*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m4, m5 1914*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_ 1915*c0909341SAndroid Build Coastguard Worker mova ym6, [spel_h_shuf2a] 1916*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m5 1917*c0909341SAndroid Build Coastguard Worker mova xm5, [spel_shuf2] 1918*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m4 1919*c0909341SAndroid Build Coastguard Worker vpermb m1, m3, m1 ; 01 12 1920*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym1, 1 ; 23 34 
1921*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 1922*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 1923*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1924*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym3, [srcq+ssq*0], 1 1925*c0909341SAndroid Build Coastguard Worker vpermb ym3, ym6, ym3 1926*c0909341SAndroid Build Coastguard Worker pmaddwd xmm0, xm12, xm1 ; a0 b0 1927*c0909341SAndroid Build Coastguard Worker mova xm4, xm10 1928*c0909341SAndroid Build Coastguard Worker vpdpwssd xm4, xm8, xm3 1929*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, ym3, 1 1930*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 1931*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xm13, xm2 ; a1 b1 1932*c0909341SAndroid Build Coastguard Worker vpdpwssd xm4, xm9, xm3 ; 5 6 1933*c0909341SAndroid Build Coastguard Worker vpermt2b xm2, xm5, xm4 ; 45 56 1934*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xm14, xm2 ; a2 b2 1935*c0909341SAndroid Build Coastguard Worker psrad xmm0, 10 1936*c0909341SAndroid Build Coastguard Worker packusdw xmm0, xmm0 1937*c0909341SAndroid Build Coastguard Worker pminsw xmm0, xm15 1938*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 1939*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 1940*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1941*c0909341SAndroid Build Coastguard Worker sub hd, 2 1942*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 1943*c0909341SAndroid Build Coastguard Worker RET 1944*c0909341SAndroid Build Coastguard Worker.hv_w4: 1945*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [spel_h_shufB] 1946*c0909341SAndroid Build Coastguard Worker mova ym0, [spel_shuf4a] 1947*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m6 1948*c0909341SAndroid Build Coastguard Worker mova m2, m10 1949*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m8, m1 1950*c0909341SAndroid Build Coastguard 
Worker pshufb xm1, xm5, xm6 1951*c0909341SAndroid Build Coastguard Worker mova xm3, xm10 1952*c0909341SAndroid Build Coastguard Worker vpdpwssd xm3, xm8, xm1 1953*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 1954*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm7 1955*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m4 ; 0 1 2 3 1956*c0909341SAndroid Build Coastguard Worker vpdpwssd xm3, xm9, xm5 ; 4 1957*c0909341SAndroid Build Coastguard Worker mova ym5, [spel_shuf4b] 1958*c0909341SAndroid Build Coastguard Worker vpermb m1, m0, m2 ; 01 12 1959*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m3, q1032 ; 2 3 4 1960*c0909341SAndroid Build Coastguard Worker vpermb m2, m0, m2 ; 23 34 1961*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1962*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+ssq*1] 1963*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1964*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym3, [srcq+ssq*0], 1 1965*c0909341SAndroid Build Coastguard Worker pmaddwd ym0, ym12, ym1 ; a0 b0 1966*c0909341SAndroid Build Coastguard Worker mova ym1, ym2 1967*c0909341SAndroid Build Coastguard Worker pshufb ym4, ym3, ym6 1968*c0909341SAndroid Build Coastguard Worker mova ym2, ym10 1969*c0909341SAndroid Build Coastguard Worker vpdpwssd ym2, ym8, ym4 1970*c0909341SAndroid Build Coastguard Worker pshufb ym3, ym7 1971*c0909341SAndroid Build Coastguard Worker vpdpwssd ym0, ym13, ym1 ; a1 b1 1972*c0909341SAndroid Build Coastguard Worker vpdpwssd ym2, ym9, ym3 ; 5 6 1973*c0909341SAndroid Build Coastguard Worker vpermt2b ym2, ym5, ym1 ; 45 56 1974*c0909341SAndroid Build Coastguard Worker vpdpwssd ym0, ym14, ym2 ; a2 b2 1975*c0909341SAndroid Build Coastguard Worker psrad ym0, 10 1976*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm4, ym0, 1 1977*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm4 1978*c0909341SAndroid Build Coastguard Worker pminsw xmm0, xm0, xm15 1979*c0909341SAndroid Build 
Coastguard Worker movq [dstq+dsq*0], xmm0 1980*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 1981*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1982*c0909341SAndroid Build Coastguard Worker sub hd, 2 1983*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1984*c0909341SAndroid Build Coastguard Worker RET 1985*c0909341SAndroid Build Coastguard Worker.hv_w8: 1986*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1987*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 1988*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1989*c0909341SAndroid Build Coastguard Worker shr myd, 16 1990*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1991*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1992*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 1993*c0909341SAndroid Build Coastguard Worker mov r6, ssq 1994*c0909341SAndroid Build Coastguard Worker sub srcq, 4 1995*c0909341SAndroid Build Coastguard Worker neg r6 1996*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 1997*c0909341SAndroid Build Coastguard Worker jnz .hv_w8_12bit 1998*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_2176] 1999*c0909341SAndroid Build Coastguard Worker psllw xmm0, 6 2000*c0909341SAndroid Build Coastguard Worker jmp .hv_w8_main 2001*c0909341SAndroid Build Coastguard Worker.hv_w8_12bit: 2002*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_640] 2003*c0909341SAndroid Build Coastguard Worker psllw xmm0, 4 2004*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 2005*c0909341SAndroid Build Coastguard Worker.hv_w8_main: 2006*c0909341SAndroid Build Coastguard Worker mova [buf+ 0], xmm0 2007*c0909341SAndroid Build Coastguard Worker mova [buf+16], xmm1 2008*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, xmm0 2009*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [buf+ 4] 2010*c0909341SAndroid Build Coastguard 
Worker vpbroadcastd m11, [buf+ 8] 2011*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm1 2012*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [buf+20] 2013*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [buf+24] 2014*c0909341SAndroid Build Coastguard Worker cmp wd, 16 2015*c0909341SAndroid Build Coastguard Worker jge .hv_w16 2016*c0909341SAndroid Build Coastguard Worker mova m6, [spel_h_shufA] 2017*c0909341SAndroid Build Coastguard Worker movu ym16, [srcq+r6 *2] 2018*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1 2019*c0909341SAndroid Build Coastguard Worker movu ym17, [srcq+ssq*0] 2020*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3 2021*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2022*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*0] ; 4 2023*c0909341SAndroid Build Coastguard Worker movu m7, [spel_h_shufC] 2024*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m16 2025*c0909341SAndroid Build Coastguard Worker mova m1, m8 2026*c0909341SAndroid Build Coastguard Worker vpermb m4, m6, m17 2027*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m3 ; a0 b0 2028*c0909341SAndroid Build Coastguard Worker mova m2, m8 2029*c0909341SAndroid Build Coastguard Worker vpermb m5, m6, m18 2030*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m4 ; c0 d0 2031*c0909341SAndroid Build Coastguard Worker mova m0, m8 2032*c0909341SAndroid Build Coastguard Worker vpermb m16, m7, m16 2033*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m9, m5 ; e0 2034*c0909341SAndroid Build Coastguard Worker vpermb m17, m7, m17 2035*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m16 ; a2 b2 2036*c0909341SAndroid Build Coastguard Worker vpermb m18, m7, m18 2037*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m11, m17 ; c2 d2 2038*c0909341SAndroid Build Coastguard Worker shufpd m3, m16, 0x55 2039*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m0, m11, m18 ; e2 2040*c0909341SAndroid Build Coastguard Worker mova m16, [spel_shuf8a] 2041*c0909341SAndroid Build Coastguard Worker shufpd m4, m17, 0x55 2042*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m10, m3 ; a1 b1 2043*c0909341SAndroid Build Coastguard Worker shufpd m5, m18, 0x55 2044*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m10, m4 ; c1 d1 2045*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m5 ; e1 2046*c0909341SAndroid Build Coastguard Worker mova m5, [spel_shuf8b] 2047*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m16, m2 ; 01 12 2048*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m16, m0 ; 23 34 2049*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 2050*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*1] 2051*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2052*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0], 1 2053*c0909341SAndroid Build Coastguard Worker mova m0, m8 2054*c0909341SAndroid Build Coastguard Worker vpermb m17, m6, m18 2055*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m9, m17 ; f0 g0 2056*c0909341SAndroid Build Coastguard Worker vpermb m18, m7, m18 2057*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m12, m1 ; A0 B0 2058*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m18 ; f2 g2 2059*c0909341SAndroid Build Coastguard Worker shufpd m17, m18, 0x55 2060*c0909341SAndroid Build Coastguard Worker mova m1, m2 2061*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m13, m2 ; A1 B1 2062*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m17 ; f1 g1 2063*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m5, m0 ; 45 56 2064*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m14, m2 ; A2 B2 2065*c0909341SAndroid Build Coastguard Worker psrad m16, 10 2066*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym17, m16, 1 2067*c0909341SAndroid Build Coastguard Worker packusdw 
ym16, ym17 2068*c0909341SAndroid Build Coastguard Worker pminsw ym16, ym15 2069*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm16 2070*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], ym16, 1 2071*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2072*c0909341SAndroid Build Coastguard Worker sub hd, 2 2073*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 2074*c0909341SAndroid Build Coastguard Worker vzeroupper 2075*c0909341SAndroid Build Coastguard Worker RET 2076*c0909341SAndroid Build Coastguard Worker.hv_w16: 2077*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [spel_h_shufA] 2078*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m21, [spel_h_shufB] 2079*c0909341SAndroid Build Coastguard Worker jg .hv_w32 2080*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [srcq+r6 *2+ 8] 2081*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 2082*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 2083*c0909341SAndroid Build Coastguard Worker movu ym16, [srcq+r6 *1+ 0] 2084*c0909341SAndroid Build Coastguard Worker movu ym17, [srcq+r6 *1+12] 2085*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [srcq+ssq*0+ 0], 1 2086*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2 2087*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*1+ 0] 2088*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*1+12] 2089*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2090*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0+ 0], 1 2091*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4 2092*c0909341SAndroid Build Coastguard Worker pshufb m2, m20 2093*c0909341SAndroid Build Coastguard Worker mova m1, m8 2094*c0909341SAndroid Build Coastguard Worker pshufb m3, m16, m20 2095*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, 
m11, m2 ; a2 2096*c0909341SAndroid Build Coastguard Worker mova m2, m8 2097*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m21 2098*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m3 ; b0 c0 2099*c0909341SAndroid Build Coastguard Worker mova m3, m8 2100*c0909341SAndroid Build Coastguard Worker pshufb m5, m18, m20 2101*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m11, m4 ; b2' c2' 2102*c0909341SAndroid Build Coastguard Worker mova m4, m8 2103*c0909341SAndroid Build Coastguard Worker pshufb m7, m19, m21 2104*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m9, m5 ; d0 e0 2105*c0909341SAndroid Build Coastguard Worker mova m5, m8 2106*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m20 2107*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m7 ; d2' e2' 2108*c0909341SAndroid Build Coastguard Worker mova m7, [spel_shuf16] 2109*c0909341SAndroid Build Coastguard Worker pshufb m16, m21 2110*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m0 ; a0 2111*c0909341SAndroid Build Coastguard Worker pshufb m17, m20 2112*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m10, m16 ; b1 c1 2113*c0909341SAndroid Build Coastguard Worker pshufb m18, m21 2114*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m10, m17 ; b1' c1' 2115*c0909341SAndroid Build Coastguard Worker pshufb m19, m20 2116*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m10, m18 ; d1 e1 2117*c0909341SAndroid Build Coastguard Worker pshufb m6, m21 2118*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m10, m19 ; d1' e1' 2119*c0909341SAndroid Build Coastguard Worker shufpd m16, m17, 0x55 2120*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m10, m6 ; a1 2121*c0909341SAndroid Build Coastguard Worker shufpd m18, m19, 0x55 2122*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m11, m16 ; b2 c2 2123*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m9, m16 ; b0' c0' 2124*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, m18 ; 
d2 e2 2125*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m9, m18 ; d0' e0' 2126*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 2127*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m7, m3 ; 12 2128*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m7, m5 ; 34 2129*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m2, 16 ; 01 2130*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 23 2131*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 2132*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*1+ 0] 2133*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*1+12] 2134*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2135*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0+ 0], 1 2136*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*0+12], 1 2137*c0909341SAndroid Build Coastguard Worker mova m5, m8 2138*c0909341SAndroid Build Coastguard Worker mova m6, m8 2139*c0909341SAndroid Build Coastguard Worker pshufb m17, m18, m20 2140*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m9, m17 ; f0 g0 2141*c0909341SAndroid Build Coastguard Worker pshufb m16, m19, m21 2142*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m11, m16 ; f2' g2' 2143*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m12, m2 ; B0 2144*c0909341SAndroid Build Coastguard Worker mova m2, m4 2145*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m12, m1 ; A0 2146*c0909341SAndroid Build Coastguard Worker mova m1, m3 2147*c0909341SAndroid Build Coastguard Worker pshufb m18, m21 2148*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m10, m18 ; f1 g1 2149*c0909341SAndroid Build Coastguard Worker pshufb m19, m20 2150*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m10, m19 ; f1' g1' 2151*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m13, m4 ; B1 2152*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m13, m3 ; A1 2153*c0909341SAndroid Build Coastguard 
Worker shufpd m18, m19, 0x55 2154*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m18 ; f2 g2 2155*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m9, m18 ; f0' g0' 2156*c0909341SAndroid Build Coastguard Worker mova m4, m7 2157*c0909341SAndroid Build Coastguard Worker vpermi2b m4, m5, m6 ; 56 2158*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 45 2159*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m14, m4 ; B2 2160*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m14, m3 ; A2 2161*c0909341SAndroid Build Coastguard Worker psrad m16, 10 2162*c0909341SAndroid Build Coastguard Worker psrad m17, 10 2163*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m16, m17, q3232 2164*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, ym17, 1 2165*c0909341SAndroid Build Coastguard Worker packusdw m16, m18 2166*c0909341SAndroid Build Coastguard Worker pminsw m16, m15 2167*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym16 2168*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m16, 1 2169*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2170*c0909341SAndroid Build Coastguard Worker sub hd, 2 2171*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 2172*c0909341SAndroid Build Coastguard Worker vzeroupper 2173*c0909341SAndroid Build Coastguard Worker RET 2174*c0909341SAndroid Build Coastguard Worker.hv_w32: 2175*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 28 2176*c0909341SAndroid Build Coastguard Worker mova m27, [spel_shuf32] 2177*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq*8-256] 2178*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0: 2179*c0909341SAndroid Build Coastguard Worker movu m16, [srcq+r6 *2+ 0] 2180*c0909341SAndroid Build Coastguard Worker movu m7, [srcq+r6 *2+12] 2181*c0909341SAndroid Build Coastguard Worker movu m6, [srcq+r6 *1+ 0] 2182*c0909341SAndroid Build Coastguard Worker movu m18, [srcq+r6 *1+12] 2183*c0909341SAndroid 
Build Coastguard Worker lea r7, [srcq+ssq*2] 2184*c0909341SAndroid Build Coastguard Worker movu m17, [srcq+ssq*0+ 0] 2185*c0909341SAndroid Build Coastguard Worker movu m19, [srcq+ssq*0+12] 2186*c0909341SAndroid Build Coastguard Worker movu m22, [srcq+ssq*1+ 0] 2187*c0909341SAndroid Build Coastguard Worker movu m24, [srcq+ssq*1+12] 2188*c0909341SAndroid Build Coastguard Worker mov r8, dstq 2189*c0909341SAndroid Build Coastguard Worker movu m23, [r7 +ssq*0+ 0] 2190*c0909341SAndroid Build Coastguard Worker movu m25, [r7 +ssq*0+12] 2191*c0909341SAndroid Build Coastguard Worker pshufb m1, m16, m20 2192*c0909341SAndroid Build Coastguard Worker mova m0, m8 2193*c0909341SAndroid Build Coastguard Worker pshufb m2, m7, m21 2194*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m9, m1 ; a0 2195*c0909341SAndroid Build Coastguard Worker mova m1, m8 2196*c0909341SAndroid Build Coastguard Worker pshufb m4, m6, m20 2197*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m2 ; a2' 2198*c0909341SAndroid Build Coastguard Worker mova m2, m8 2199*c0909341SAndroid Build Coastguard Worker pshufb m3, m17, m20 2200*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m4 ; b0 2201*c0909341SAndroid Build Coastguard Worker mova m4, m8 2202*c0909341SAndroid Build Coastguard Worker pshufb m5, m18, m21 2203*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m9, m3 ; c0 2204*c0909341SAndroid Build Coastguard Worker mova m3, m8 2205*c0909341SAndroid Build Coastguard Worker pshufb m26, m19, m21 2206*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m11, m5 ; b2' 2207*c0909341SAndroid Build Coastguard Worker mova m5, m8 2208*c0909341SAndroid Build Coastguard Worker pshufb m16, m21 2209*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m26 ; c2' 2210*c0909341SAndroid Build Coastguard Worker pshufb m7, m20 2211*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m16 ; a1 2212*c0909341SAndroid Build Coastguard Worker pshufb m6, m21 2213*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m1, m10, m7 ; a1' 2214*c0909341SAndroid Build Coastguard Worker pshufb m17, m21 2215*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m10, m6 ; b1 2216*c0909341SAndroid Build Coastguard Worker pshufb m18, m20 2217*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m10, m17 ; c1 2218*c0909341SAndroid Build Coastguard Worker pshufb m19, m20 2219*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m10, m18 ; b1' 2220*c0909341SAndroid Build Coastguard Worker shufpd m16, m7, 0x55 2221*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m10, m19 ; c1' 2222*c0909341SAndroid Build Coastguard Worker shufpd m6, m18, 0x55 2223*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m16 ; a2 2224*c0909341SAndroid Build Coastguard Worker shufpd m17, m19, 0x55 2225*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m16 ; a0' 2226*c0909341SAndroid Build Coastguard Worker pshufb m16, m22, m20 2227*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m11, m6 ; b2 2228*c0909341SAndroid Build Coastguard Worker pshufb m7, m23, m20 2229*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, m17 ; c2 2230*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m9, m6 ; b0' 2231*c0909341SAndroid Build Coastguard Worker mova m6, m8 2232*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m9, m17 ; c0' 2233*c0909341SAndroid Build Coastguard Worker pshufb m17, m24, m21 2234*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m9, m16 ; d0 2235*c0909341SAndroid Build Coastguard Worker mova m16, m8 2236*c0909341SAndroid Build Coastguard Worker pshufb m26, m25, m21 2237*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m9, m7 ; e0 2238*c0909341SAndroid Build Coastguard Worker mova m7, m8 2239*c0909341SAndroid Build Coastguard Worker pshufb m22, m21 2240*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m11, m17 ; d2' 2241*c0909341SAndroid Build Coastguard Worker mova m17, m8 2242*c0909341SAndroid Build Coastguard Worker pshufb 
m23, m21 2243*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m11, m26 ; e2' 2244*c0909341SAndroid Build Coastguard Worker pshufb m24, m20 2245*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m10, m22 ; d1 2246*c0909341SAndroid Build Coastguard Worker pshufb m25, m20 2247*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m10, m23 ; e1 2248*c0909341SAndroid Build Coastguard Worker shufpd m22, m24, 0x55 2249*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m10, m24 ; d1' 2250*c0909341SAndroid Build Coastguard Worker shufpd m23, m25, 0x55 2251*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m10, m25 ; e1' 2252*c0909341SAndroid Build Coastguard Worker pslldq m0, 1 2253*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m11, m22 ; d2 2254*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 2255*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m11, m23 ; e2 2256*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m27, m4 ; 12 2257*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m9, m22 ; d0' 2258*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m27, m5 ; 12' 2259*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m9, m23 ; e0' 2260*c0909341SAndroid Build Coastguard Worker vpshrdd m0, m2, 16 ; 01 2261*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m27, m16 ; 34 2262*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m3, 16 ; 01' 2263*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m27, m17 ; 34' 2264*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m2, m6, 16 ; 23 2265*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m3, m7, 16 ; 23' 2266*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 2267*c0909341SAndroid Build Coastguard Worker movu m22, [r7+ssq*1+ 0] 2268*c0909341SAndroid Build Coastguard Worker movu m24, [r7+ssq*1+12] 2269*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 2270*c0909341SAndroid Build Coastguard Worker movu m23, [r7+ssq*0+ 0] 2271*c0909341SAndroid Build 
Coastguard Worker movu m25, [r7+ssq*0+12] 2272*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m12, m2 ; B0 2273*c0909341SAndroid Build Coastguard Worker mova m2, m6 2274*c0909341SAndroid Build Coastguard Worker pmaddwd m19, m12, m3 ; B0' 2275*c0909341SAndroid Build Coastguard Worker mova m3, m7 2276*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m12, m0 ; A0 2277*c0909341SAndroid Build Coastguard Worker mova m0, m4 2278*c0909341SAndroid Build Coastguard Worker pmaddwd m18, m12, m1 ; A0' 2279*c0909341SAndroid Build Coastguard Worker mova m1, m5 2280*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m13, m6 ; B1 2281*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m13, m7 ; B1' 2282*c0909341SAndroid Build Coastguard Worker mova m6, m8 2283*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m13, m4 ; A1 2284*c0909341SAndroid Build Coastguard Worker pshufb m4, m22, m20 2285*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m13, m5 ; A1' 2286*c0909341SAndroid Build Coastguard Worker pshufb m7, m23, m20 2287*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m9, m4 ; f0 2288*c0909341SAndroid Build Coastguard Worker mova m4, m8 2289*c0909341SAndroid Build Coastguard Worker pshufb m5, m24, m21 2290*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m9, m7 ; g0 2291*c0909341SAndroid Build Coastguard Worker mova m7, m8 2292*c0909341SAndroid Build Coastguard Worker pshufb m26, m25, m21 2293*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m11, m5 ; f2' 2294*c0909341SAndroid Build Coastguard Worker mova m5, m8 2295*c0909341SAndroid Build Coastguard Worker pshufb m22, m21 2296*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m26 ; g2' 2297*c0909341SAndroid Build Coastguard Worker pshufb m23, m21 2298*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m10, m22 ; f1 2299*c0909341SAndroid Build Coastguard Worker pshufb m24, m20 2300*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m10, m23 ; g1 2301*c0909341SAndroid 
Build Coastguard Worker pshufb m25, m20 2302*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m10, m24 ; f1' 2303*c0909341SAndroid Build Coastguard Worker shufpd m22, m24, 0x55 2304*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m10, m25 ; g1' 2305*c0909341SAndroid Build Coastguard Worker shufpd m23, m25, 0x55 2306*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m11, m22 ; f2 2307*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, m23 ; g2 2308*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m9, m22 ; f0' 2309*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m9, m23 ; g0' 2310*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m27, m4 ; 56 2311*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m27, m5 ; 56' 2312*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m14, m6 ; B2 2313*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m2, m6, 16 ; 45 2314*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m14, m7 ; B2' 2315*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m3, m7, 16 ; 45' 2316*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m14, m4 ; A2 2317*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m14, m5 ; A2' 2318*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 10}, m17, m19, m16, m18 2319*c0909341SAndroid Build Coastguard Worker packusdw m17, m19 2320*c0909341SAndroid Build Coastguard Worker packusdw m16, m18 2321*c0909341SAndroid Build Coastguard Worker pminsw m17, m15 2322*c0909341SAndroid Build Coastguard Worker pminsw m16, m15 2323*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], m16 2324*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*1], m17 2325*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 2326*c0909341SAndroid Build Coastguard Worker sub hd, 2 2327*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 2328*c0909341SAndroid Build Coastguard Worker add srcq, 64 2329*c0909341SAndroid Build Coastguard Worker add dstq, 64 2330*c0909341SAndroid 
Build Coastguard Worker movzx hd, wb 2331*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 2332*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop0 2333*c0909341SAndroid Build Coastguard Worker RET 2334*c0909341SAndroid Build Coastguard Worker 2335*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 2336*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc 2337*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc 2338*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc 2339*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp, SHARP, SHARP 2340*c0909341SAndroid Build Coastguard Worker 2341*c0909341SAndroid Build Coastguard Workercglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my 2342*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 2343*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 2344*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 2345*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 2346*c0909341SAndroid Build Coastguard Worker lea r8, [put_avx512icl] 2347*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 2348*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 2349*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 2350*c0909341SAndroid Build Coastguard Worker jnz .h 2351*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2352*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put 2353*c0909341SAndroid Build Coastguard Worker.v: 2354*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2355*c0909341SAndroid Build Coastguard Worker shr myd, 16 2356*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2357*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2358*c0909341SAndroid Build Coastguard 
Worker vpbroadcastd m10, [pd_32] 2359*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+myq*8] 2360*c0909341SAndroid Build Coastguard Worker tzcnt r7d, wd 2361*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, r8m 2362*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2363*c0909341SAndroid Build Coastguard Worker movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] 2364*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2365*c0909341SAndroid Build Coastguard Worker mova [rsp+stack_offset+8], xmm0 2366*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 2367*c0909341SAndroid Build Coastguard Worker add r7, r8 2368*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [rsp+stack_offset+12] 2369*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [rsp+stack_offset+16] 2370*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [rsp+stack_offset+20] 2371*c0909341SAndroid Build Coastguard Worker jmp r7 2372*c0909341SAndroid Build Coastguard Worker.v_w2: 2373*c0909341SAndroid Build Coastguard Worker movd xmm2, [srcq+ssq*0] 2374*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*1], 1 2375*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*2], 2 2376*c0909341SAndroid Build Coastguard Worker add srcq, r6 2377*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 2378*c0909341SAndroid Build Coastguard Worker movd xmm3, [srcq+ssq*1] 2379*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm1, [srcq+ssq*2] 2380*c0909341SAndroid Build Coastguard Worker add srcq, r6 2381*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 2382*c0909341SAndroid Build Coastguard Worker vpblendd xmm3, xmm1, 0x02 ; 4 5 2383*c0909341SAndroid Build Coastguard Worker vpblendd xmm1, xmm0, 0x02 ; 5 6 2384*c0909341SAndroid Build Coastguard Worker palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 2385*c0909341SAndroid Build Coastguard Worker punpcklwd xmm3, xmm1 ; 45 
56 2386*c0909341SAndroid Build Coastguard Worker punpcklwd xmm1, xmm2, xmm4 ; 01 12 2387*c0909341SAndroid Build Coastguard Worker punpckhwd xmm2, xmm4 ; 23 34 2388*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 2389*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm4, [srcq+ssq*1] 2390*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2391*c0909341SAndroid Build Coastguard Worker mova xmm5, xm10 2392*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xm12, xmm1 ; a0 b0 2393*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 2394*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xm13, xmm2 ; a1 b1 2395*c0909341SAndroid Build Coastguard Worker mova xmm2, xmm3 2396*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xm14, xmm3 ; a2 b2 2397*c0909341SAndroid Build Coastguard Worker vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 2398*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 2399*c0909341SAndroid Build Coastguard Worker vpblendd xmm4, xmm0, 0x02 ; 7 8 2400*c0909341SAndroid Build Coastguard Worker punpcklwd xmm3, xmm4 ; 67 78 2401*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xm15, xmm3 ; a3 b3 2402*c0909341SAndroid Build Coastguard Worker psrad xmm5, 6 2403*c0909341SAndroid Build Coastguard Worker packusdw xmm5, xmm5 2404*c0909341SAndroid Build Coastguard Worker pminsw xmm5, xm11 2405*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm5 2406*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm5, 1 2407*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2408*c0909341SAndroid Build Coastguard Worker sub hd, 2 2409*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 2410*c0909341SAndroid Build Coastguard Worker RET 2411*c0909341SAndroid Build Coastguard Worker.v_w4: 2412*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+ssq*0] 2413*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*1] 2414*c0909341SAndroid Build Coastguard Worker 
vpbroadcastq ymm2, [srcq+ssq*2] 2415*c0909341SAndroid Build Coastguard Worker add srcq, r6 2416*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm4, [srcq+ssq*0] 2417*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+ssq*1] 2418*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm5, [srcq+ssq*2] 2419*c0909341SAndroid Build Coastguard Worker add srcq, r6 2420*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm0, 0x30 2421*c0909341SAndroid Build Coastguard Worker vpblendd ymm0, ymm2, 0x30 2422*c0909341SAndroid Build Coastguard Worker punpcklwd ymm1, ymm0 ; 01 12 2423*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 2424*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm4, 0x30 2425*c0909341SAndroid Build Coastguard Worker vpblendd ymm4, ymm3, 0x30 2426*c0909341SAndroid Build Coastguard Worker punpcklwd ymm2, ymm4 ; 23 34 2427*c0909341SAndroid Build Coastguard Worker vpblendd ymm3, ymm5, 0x30 2428*c0909341SAndroid Build Coastguard Worker vpblendd ymm5, ymm0, 0x30 2429*c0909341SAndroid Build Coastguard Worker punpcklwd ymm3, ymm5 ; 45 56 2430*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 2431*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm5, [srcq+ssq*1] 2432*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2433*c0909341SAndroid Build Coastguard Worker mova ymm4, ym10 2434*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ym12, ymm1 ; a0 b0 2435*c0909341SAndroid Build Coastguard Worker mova ymm1, ymm2 2436*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ym13, ymm2 ; a1 b1 2437*c0909341SAndroid Build Coastguard Worker mova ymm2, ymm3 2438*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ym14, ymm3 ; a2 b2 2439*c0909341SAndroid Build Coastguard Worker vpblendd ymm3, ymm0, ymm5, 0x30 2440*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 2441*c0909341SAndroid Build Coastguard Worker vpblendd ymm5, ymm0, 0x30 2442*c0909341SAndroid 
Build Coastguard Worker punpcklwd ymm3, ymm5 ; 67 78 2443*c0909341SAndroid Build Coastguard Worker vpdpwssd ymm4, ym15, ymm3 ; a3 b3 2444*c0909341SAndroid Build Coastguard Worker psrad ymm4, 6 2445*c0909341SAndroid Build Coastguard Worker vextracti128 xmm5, ymm4, 1 2446*c0909341SAndroid Build Coastguard Worker packusdw xmm4, xmm5 2447*c0909341SAndroid Build Coastguard Worker pminsw xmm4, xm11 2448*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm4 2449*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm4 2450*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2451*c0909341SAndroid Build Coastguard Worker sub hd, 2 2452*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 2453*c0909341SAndroid Build Coastguard Worker vzeroupper 2454*c0909341SAndroid Build Coastguard Worker RET 2455*c0909341SAndroid Build Coastguard Worker.v_w8: 2456*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ssq*2] 2457*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, m2, [srcq+ssq*0], 0 2458*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 2459*c0909341SAndroid Build Coastguard Worker add srcq, r6 2460*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+ssq*0], 1 2461*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 2462*c0909341SAndroid Build Coastguard Worker mova m6, [spel_v_shuf8] 2463*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*1] 2464*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [srcq+ssq*2], 1 2465*c0909341SAndroid Build Coastguard Worker add srcq, r6 2466*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2467*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m1 ; 01 12 2468*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m2 ; 23 34 2469*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m0 ; 45 56 2470*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 
2471*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+ssq*1], 3 2472*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2473*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*0] 2474*c0909341SAndroid Build Coastguard Worker mova m4, m10 2475*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m1 ; a0 b0 2476*c0909341SAndroid Build Coastguard Worker mova m1, m2 2477*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m5, q1032 ; 6 7 8 2478*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 b1 2479*c0909341SAndroid Build Coastguard Worker mova m2, m3 2480*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m3 ; a2 b2 2481*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m0 ; 67 78 2482*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m15, m3 ; a3 b3 2483*c0909341SAndroid Build Coastguard Worker psrad m4, 6 2484*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym5, m4, 1 2485*c0909341SAndroid Build Coastguard Worker packusdw ym4, ym5 2486*c0909341SAndroid Build Coastguard Worker pminsw ym4, ym11 2487*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm4 2488*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym4, 1 2489*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2490*c0909341SAndroid Build Coastguard Worker sub hd, 2 2491*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 2492*c0909341SAndroid Build Coastguard Worker RET 2493*c0909341SAndroid Build Coastguard Worker.v_w16: 2494*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m0, [srcq+ssq*1] 2495*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, [srcq+ssq*2], 1 2496*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*0], 0 2497*c0909341SAndroid Build Coastguard Worker mova m8, [spel_v_shuf16] 2498*c0909341SAndroid Build Coastguard Worker add srcq, r6 2499*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*0] 2500*c0909341SAndroid 
Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*1], 1 2501*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+ssq*2] 2502*c0909341SAndroid Build Coastguard Worker add srcq, r6 2503*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+ssq*0], 1 2504*c0909341SAndroid Build Coastguard Worker vpermb m1, m8, m1 ; 12 2505*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 ; 01 2506*c0909341SAndroid Build Coastguard Worker vpermb m3, m8, m3 ; 34 2507*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m5 ; 56 2508*c0909341SAndroid Build Coastguard Worker mova m9, [deint_q_shuf] 2509*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 23 2510*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m3, m5, 16 ; 45 2511*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 2512*c0909341SAndroid Build Coastguard Worker mova m7, m10 2513*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m1 ; b0 2514*c0909341SAndroid Build Coastguard Worker mova m6, m10 2515*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; a0 2516*c0909341SAndroid Build Coastguard Worker mova m1, m3 2517*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m3 ; b1 2518*c0909341SAndroid Build Coastguard Worker mova m0, m2 2519*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m2 ; a1 2520*c0909341SAndroid Build Coastguard Worker mova m3, m5 2521*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m5 ; b2 2522*c0909341SAndroid Build Coastguard Worker mova m2, m4 2523*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m4 ; a2 2524*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+ssq*1] 2525*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2526*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+ssq*0], 1 2527*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m5 ; 78 2528*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m3, m5, 16 ; 67 2529*c0909341SAndroid Build Coastguard 
Worker vpdpwssd m7, m15, m5 ; b3 2530*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m4 ; a3 2531*c0909341SAndroid Build Coastguard Worker psrad m7, 6 2532*c0909341SAndroid Build Coastguard Worker psrad m6, 6 2533*c0909341SAndroid Build Coastguard Worker packusdw m6, m7 2534*c0909341SAndroid Build Coastguard Worker pminsw m6, m11 2535*c0909341SAndroid Build Coastguard Worker vpermq m6, m9, m6 2536*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym6 2537*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m6, 1 2538*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2539*c0909341SAndroid Build Coastguard Worker sub hd, 2 2540*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 2541*c0909341SAndroid Build Coastguard Worker RET 2542*c0909341SAndroid Build Coastguard Worker.v_w32: 2543*c0909341SAndroid Build Coastguard Worker.v_w64: 2544*c0909341SAndroid Build Coastguard Worker.v_w128: 2545*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 23 2546*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq*8-256] 2547*c0909341SAndroid Build Coastguard Worker.v_w32_loop0: 2548*c0909341SAndroid Build Coastguard Worker movu m16, [srcq+ssq*0] 2549*c0909341SAndroid Build Coastguard Worker movu m17, [srcq+ssq*1] 2550*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+r6 ] 2551*c0909341SAndroid Build Coastguard Worker movu m18, [srcq+ssq*2] 2552*c0909341SAndroid Build Coastguard Worker movu m19, [r7 +ssq*0] 2553*c0909341SAndroid Build Coastguard Worker mov r8, dstq 2554*c0909341SAndroid Build Coastguard Worker movu m20, [r7 +ssq*1] 2555*c0909341SAndroid Build Coastguard Worker movu m21, [r7 +ssq*2] 2556*c0909341SAndroid Build Coastguard Worker add r7, r6 2557*c0909341SAndroid Build Coastguard Worker movu m22, [r7 +ssq*0] 2558*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m16, m17 ; 01l 2559*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 ; 01h 2560*c0909341SAndroid Build Coastguard Worker 
punpcklwd m1, m17, m18 ; 12l 2561*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m18 ; 12h 2562*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m18, m19 ; 23l 2563*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19 ; 23h 2564*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m19, m20 ; 34l 2565*c0909341SAndroid Build Coastguard Worker punpckhwd m19, m20 ; 34h 2566*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m20, m21 ; 45l 2567*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m21 ; 45h 2568*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m21, m22 ; 56l 2569*c0909341SAndroid Build Coastguard Worker punpckhwd m21, m22 ; 56h 2570*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 2571*c0909341SAndroid Build Coastguard Worker mova m6, m10 2572*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; a0l 2573*c0909341SAndroid Build Coastguard Worker mova m8, m10 2574*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m12, m16 ; a0h 2575*c0909341SAndroid Build Coastguard Worker mova m7, m10 2576*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m1 ; b0l 2577*c0909341SAndroid Build Coastguard Worker mova m9, m10 2578*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m12, m17 ; b0h 2579*c0909341SAndroid Build Coastguard Worker mova m0, m2 2580*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m2 ; a1l 2581*c0909341SAndroid Build Coastguard Worker mova m16, m18 2582*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m13, m18 ; a1h 2583*c0909341SAndroid Build Coastguard Worker mova m1, m3 2584*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m3 ; b1l 2585*c0909341SAndroid Build Coastguard Worker mova m17, m19 2586*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m13, m19 ; b1h 2587*c0909341SAndroid Build Coastguard Worker mova m2, m4 2588*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m4 ; a2l 2589*c0909341SAndroid Build Coastguard Worker mova m18, m20 
2590*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m14, m20 ; a2h 2591*c0909341SAndroid Build Coastguard Worker mova m3, m5 2592*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m5 ; b2l 2593*c0909341SAndroid Build Coastguard Worker mova m19, m21 2594*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m14, m21 ; b2h 2595*c0909341SAndroid Build Coastguard Worker movu m21, [r7+ssq*1] 2596*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 2597*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m22, m21 ; 67l 2598*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m22, m21 ; 67h 2599*c0909341SAndroid Build Coastguard Worker movu m22, [r7+ssq*0] 2600*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m4 ; a3l 2601*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m15, m20 ; a3h 2602*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m21, m22 ; 78l 2603*c0909341SAndroid Build Coastguard Worker punpckhwd m21, m22 ; 78h 2604*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m15, m5 ; b3l 2605*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m15, m21 ; b3h 2606*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m6, m8, m7, m9 2607*c0909341SAndroid Build Coastguard Worker packusdw m6, m8 2608*c0909341SAndroid Build Coastguard Worker packusdw m7, m9 2609*c0909341SAndroid Build Coastguard Worker pminsw m6, m11 2610*c0909341SAndroid Build Coastguard Worker pminsw m7, m11 2611*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], m6 2612*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*1], m7 2613*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 2614*c0909341SAndroid Build Coastguard Worker sub hd, 2 2615*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 2616*c0909341SAndroid Build Coastguard Worker add srcq, 64 2617*c0909341SAndroid Build Coastguard Worker add dstq, 64 2618*c0909341SAndroid Build Coastguard Worker movzx hd, wb 2619*c0909341SAndroid Build Coastguard Worker sub wd, 
1<<8 2620*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop0 2621*c0909341SAndroid Build Coastguard Worker RET 2622*c0909341SAndroid Build Coastguard Worker.h_w2: 2623*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 2624*c0909341SAndroid Build Coastguard Worker mova ym2, [spel_h_shuf2a] 2625*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2626*c0909341SAndroid Build Coastguard Worker pshufd xmm3, xmm0, q1111 2627*c0909341SAndroid Build Coastguard Worker pshufd xmm4, xmm0, q2222 2628*c0909341SAndroid Build Coastguard Worker.h_w2_loop: 2629*c0909341SAndroid Build Coastguard Worker movu xm1, [srcq+ssq*0] 2630*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, [srcq+ssq*1], 1 2631*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2632*c0909341SAndroid Build Coastguard Worker mova xmm0, xm8 2633*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym2, ym1 2634*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xmm3, xm1 2635*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym1, 1 2636*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xmm4, xm1 2637*c0909341SAndroid Build Coastguard Worker psrad xmm0, 6 2638*c0909341SAndroid Build Coastguard Worker packusdw xmm0, xmm0 2639*c0909341SAndroid Build Coastguard Worker pminsw xmm0, xm15 2640*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 2641*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 2642*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2643*c0909341SAndroid Build Coastguard Worker sub hd, 2 2644*c0909341SAndroid Build Coastguard Worker jg .h_w2_loop 2645*c0909341SAndroid Build Coastguard Worker RET 2646*c0909341SAndroid Build Coastguard Worker.h_w4: 2647*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2648*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2649*c0909341SAndroid Build Coastguard Worker jl .h_w2 2650*c0909341SAndroid Build Coastguard Worker 
vbroadcasti32x4 ym4, [spel_h_shufA] 2651*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym5, [spel_h_shufB] 2652*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2653*c0909341SAndroid Build Coastguard Worker pshufd xmm0, xmm0, q2211 2654*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym6, xmm0 2655*c0909341SAndroid Build Coastguard Worker vpermq ym7, ymm0, q1111 2656*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 2657*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+ssq*0] 2658*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+ssq*1], 1 2659*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2660*c0909341SAndroid Build Coastguard Worker mova ym0, ym8 2661*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym2, ym4 2662*c0909341SAndroid Build Coastguard Worker vpdpwssd ym0, ym6, ym1 2663*c0909341SAndroid Build Coastguard Worker pshufb ym2, ym5 2664*c0909341SAndroid Build Coastguard Worker vpdpwssd ym0, ym7, ym2 2665*c0909341SAndroid Build Coastguard Worker psrad ym0, 6 2666*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 2667*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm1 2668*c0909341SAndroid Build Coastguard Worker pminsw xmm0, xm0, xm15 2669*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm0 2670*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 2671*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2672*c0909341SAndroid Build Coastguard Worker sub hd, 2 2673*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 2674*c0909341SAndroid Build Coastguard Worker RET 2675*c0909341SAndroid Build Coastguard Worker.h_w8: 2676*c0909341SAndroid Build Coastguard Worker mova m4, [spel_h_shufA] 2677*c0909341SAndroid Build Coastguard Worker movu m5, [spel_h_shufB] 2678*c0909341SAndroid Build Coastguard Worker movu m6, [spel_h_shufC] 2679*c0909341SAndroid Build Coastguard Worker mova m7, [spel_h_shufD] 2680*c0909341SAndroid Build Coastguard 
Worker.h_w8_loop: 2681*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+ssq*0] 2682*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+ssq*1], 1 2683*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2684*c0909341SAndroid Build Coastguard Worker mova m0, m8 2685*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, m2 2686*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m1 2687*c0909341SAndroid Build Coastguard Worker vpermb m1, m5, m2 2688*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m1 2689*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m2 2690*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m1 2691*c0909341SAndroid Build Coastguard Worker vpermb m1, m7, m2 2692*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m1 2693*c0909341SAndroid Build Coastguard Worker psrad m0, 6 2694*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym1, m0, 1 2695*c0909341SAndroid Build Coastguard Worker packusdw ym0, ym1 2696*c0909341SAndroid Build Coastguard Worker pminsw ym0, ym15 2697*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 2698*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 2699*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2700*c0909341SAndroid Build Coastguard Worker sub hd, 2 2701*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 2702*c0909341SAndroid Build Coastguard Worker RET 2703*c0909341SAndroid Build Coastguard Worker.h: 2704*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, r8m 2705*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2706*c0909341SAndroid Build Coastguard Worker jnz .hv 2707*c0909341SAndroid Build Coastguard Worker mov r7d, r8m 2708*c0909341SAndroid Build Coastguard Worker shr r7d, 11 2709*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] 2710*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2711*c0909341SAndroid Build Coastguard Worker 
jle .h_w4 2712*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2713*c0909341SAndroid Build Coastguard Worker sub srcq, 6 2714*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2715*c0909341SAndroid Build Coastguard Worker mova [buf], xmm0 2716*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, xmm0 2717*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [buf+ 4] 2718*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [buf+ 8] 2719*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [buf+12] 2720*c0909341SAndroid Build Coastguard Worker sub wd, 16 2721*c0909341SAndroid Build Coastguard Worker jl .h_w8 2722*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufA] 2723*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [spel_h_shufB] 2724*c0909341SAndroid Build Coastguard Worker jg .h_w32 2725*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 2726*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+ssq*0+ 0] 2727*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+ssq*1+ 0], 1 2728*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*0+16] 2729*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*1+16], 1 2730*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2731*c0909341SAndroid Build Coastguard Worker mova m0, m8 2732*c0909341SAndroid Build Coastguard Worker mova m1, m8 2733*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 2734*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m4 ; a0 2735*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 2736*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; b2 2737*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m7 2738*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m4 ; a1 2739*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m7 2740*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m4 ; b3 
2741*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 2742*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 2743*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 ; a2 2744*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m10, m4 ; b0 2745*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 2746*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 ; a3 2747*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m2 ; b1 2748*c0909341SAndroid Build Coastguard Worker psrad m0, 6 2749*c0909341SAndroid Build Coastguard Worker psrad m1, 6 2750*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 2751*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 2752*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 2753*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 2754*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2755*c0909341SAndroid Build Coastguard Worker sub hd, 2 2756*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 2757*c0909341SAndroid Build Coastguard Worker RET 2758*c0909341SAndroid Build Coastguard Worker.h_w32: 2759*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 2760*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 2761*c0909341SAndroid Build Coastguard Worker neg wq 2762*c0909341SAndroid Build Coastguard Worker.h_w32_loop0: 2763*c0909341SAndroid Build Coastguard Worker mov r6, wq 2764*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 2765*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+ 0] 2766*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+r6*2+ 8] 2767*c0909341SAndroid Build Coastguard Worker mova m0, m8 2768*c0909341SAndroid Build Coastguard Worker mova m1, m8 2769*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 2770*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m10, m4 ; a0 2771*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 2772*c0909341SAndroid 
Build Coastguard Worker vpdpwssd m1, m10, m4 ; b0 2773*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 ; a2 2774*c0909341SAndroid Build Coastguard Worker movu m4, [srcq+r6*2+16] 2775*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 2776*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m3 ; b1 2777*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m3 ; a3 2778*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m6 2779*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m3 ; b2 2780*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 2781*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m11, m2 ; a1 2782*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 2783*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m4 ; b3 2784*c0909341SAndroid Build Coastguard Worker psrad m0, 6 2785*c0909341SAndroid Build Coastguard Worker psrad m1, 6 2786*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 2787*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 2788*c0909341SAndroid Build Coastguard Worker mova [dstq+r6*2], m0 2789*c0909341SAndroid Build Coastguard Worker add r6, 32 2790*c0909341SAndroid Build Coastguard Worker jl .h_w32_loop 2791*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2792*c0909341SAndroid Build Coastguard Worker add dstq, dsq 2793*c0909341SAndroid Build Coastguard Worker dec hd 2794*c0909341SAndroid Build Coastguard Worker jg .h_w32_loop0 2795*c0909341SAndroid Build Coastguard Worker RET 2796*c0909341SAndroid Build Coastguard Worker.hv: 2797*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2798*c0909341SAndroid Build Coastguard Worker jg .hv_w8 2799*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2800*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2801*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2802*c0909341SAndroid Build Coastguard Worker shr myd, 16 2803*c0909341SAndroid Build Coastguard Worker cmp hd, 6 
2804*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2805*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+myq*8] 2806*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2807*c0909341SAndroid Build Coastguard Worker sub srcq, 2 2808*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2809*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 2810*c0909341SAndroid Build Coastguard Worker jnz .hv_12bit 2811*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_2176] 2812*c0909341SAndroid Build Coastguard Worker psllw xmm0, 6 2813*c0909341SAndroid Build Coastguard Worker jmp .hv_main 2814*c0909341SAndroid Build Coastguard Worker.hv_12bit: 2815*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_640] 2816*c0909341SAndroid Build Coastguard Worker psllw xmm0, 4 2817*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 2818*c0909341SAndroid Build Coastguard Worker.hv_main: 2819*c0909341SAndroid Build Coastguard Worker mova [buf+ 0], xmm0 2820*c0909341SAndroid Build Coastguard Worker mova [buf+16], xmm1 2821*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [buf+ 4] 2822*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [buf+ 8] 2823*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym11, xmm1 2824*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, [buf+20] 2825*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym13, [buf+24] 2826*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym14, [buf+28] 2827*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*0] 2828*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym4, [srcq+ssq*1], 1 2829*c0909341SAndroid Build Coastguard Worker vinserti32x4 m4, [srcq+ssq*2], 2 2830*c0909341SAndroid Build Coastguard Worker add srcq, r6 2831*c0909341SAndroid Build Coastguard Worker vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 2832*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+ssq*1] 2833*c0909341SAndroid Build 
Coastguard Worker vinserti32x4 ym0, [srcq+ssq*2], 1 2834*c0909341SAndroid Build Coastguard Worker add srcq, r6 2835*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 2836*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2837*c0909341SAndroid Build Coastguard Worker je .hv_w4 2838*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [spel_h_shufA] 2839*c0909341SAndroid Build Coastguard Worker mova m3, [spel_h_shuf2b] 2840*c0909341SAndroid Build Coastguard Worker mova ym6, [spel_h_shuf2a] 2841*c0909341SAndroid Build Coastguard Worker mova xm7, [spel_shuf2] 2842*c0909341SAndroid Build Coastguard Worker mova m1, m10 2843*c0909341SAndroid Build Coastguard Worker pshufb m4, m2 2844*c0909341SAndroid Build Coastguard Worker pshufb m0, m2 2845*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m4, m0 2846*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m8, m2 ; 04 15 26 3_ 2847*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m0 2848*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m4 2849*c0909341SAndroid Build Coastguard Worker vpermb m1, m3, m1 ; 01 12 2850*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym1, 1 ; 23 34 2851*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m1, 2 ; 45 56 2852*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 2853*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+ssq*1] 2854*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2855*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym5, [srcq+ssq*0], 1 2856*c0909341SAndroid Build Coastguard Worker mova xm4, xm10 2857*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym6, ym5 2858*c0909341SAndroid Build Coastguard Worker pmaddwd xmm0, xm11, xm1 ; a0 b0 2859*c0909341SAndroid Build Coastguard Worker vpdpwssd xm4, xm8, xm5 2860*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm5, ym5, 1 2861*c0909341SAndroid Build Coastguard Worker mova xm1, xm2 2862*c0909341SAndroid 
Build Coastguard Worker vpdpwssd xmm0, xm12, xm2 ; a1 b1 2863*c0909341SAndroid Build Coastguard Worker vpdpwssd xm4, xm9, xm5 ; 7 8 2864*c0909341SAndroid Build Coastguard Worker mova xm2, xm3 2865*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xm13, xm3 ; a2 b2 2866*c0909341SAndroid Build Coastguard Worker vpermt2b xm3, xm7, xm4 ; 67 78 2867*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm0, xm14, xm3 ; a3 b3 2868*c0909341SAndroid Build Coastguard Worker psrad xmm0, 10 2869*c0909341SAndroid Build Coastguard Worker packusdw xmm0, xmm0 2870*c0909341SAndroid Build Coastguard Worker pminsw xmm0, xm15 2871*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 2872*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 2873*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2874*c0909341SAndroid Build Coastguard Worker sub hd, 2 2875*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 2876*c0909341SAndroid Build Coastguard Worker RET 2877*c0909341SAndroid Build Coastguard Worker.hv_w4: 2878*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m19, [spel_h_shufA] 2879*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [spel_h_shufB] 2880*c0909341SAndroid Build Coastguard Worker mova ym6, [spel_shuf4a] 2881*c0909341SAndroid Build Coastguard Worker mova ym7, [spel_shuf4b] 2882*c0909341SAndroid Build Coastguard Worker mova m2, m10 2883*c0909341SAndroid Build Coastguard Worker mova m3, m10 2884*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m19 2885*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m8, m1 2886*c0909341SAndroid Build Coastguard Worker pshufb m1, m0, m19 2887*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m8, m1 2888*c0909341SAndroid Build Coastguard Worker pshufb m4, m20 2889*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m4 2890*c0909341SAndroid Build Coastguard Worker pshufb m0, m20 2891*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m9, m0 
2892*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m2 ; 01 12 2893*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m3, q1032 2894*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m3 ; 45 56 2895*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m2 ; 23 34 2896*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 2897*c0909341SAndroid Build Coastguard Worker movu xm18, [srcq+ssq*1] 2898*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2899*c0909341SAndroid Build Coastguard Worker vinserti128 ym18, [srcq+ssq*0], 1 2900*c0909341SAndroid Build Coastguard Worker pmaddwd ym16, ym11, ym1 ; a0 b0 2901*c0909341SAndroid Build Coastguard Worker mova ym1, ym2 2902*c0909341SAndroid Build Coastguard Worker mova ym2, ym3 2903*c0909341SAndroid Build Coastguard Worker pshufb ym17, ym18, ym19 2904*c0909341SAndroid Build Coastguard Worker mova ym3, ym10 2905*c0909341SAndroid Build Coastguard Worker vpdpwssd ym3, ym8, ym17 2906*c0909341SAndroid Build Coastguard Worker pshufb ym18, ym20 2907*c0909341SAndroid Build Coastguard Worker vpdpwssd ym16, ym12, ym1 ; a1 b1 2908*c0909341SAndroid Build Coastguard Worker vpdpwssd ym3, ym9, ym18 ; 7 8 2909*c0909341SAndroid Build Coastguard Worker vpdpwssd ym16, ym13, ym2 ; a2 b2 2910*c0909341SAndroid Build Coastguard Worker vpermt2b ym3, ym7, ym2 ; 67 78 2911*c0909341SAndroid Build Coastguard Worker vpdpwssd ym16, ym14, ym3 ; a3 b3 2912*c0909341SAndroid Build Coastguard Worker psrad ym16, 10 2913*c0909341SAndroid Build Coastguard Worker vextracti128 xm17, ym16, 1 2914*c0909341SAndroid Build Coastguard Worker packusdw xm16, xm17 2915*c0909341SAndroid Build Coastguard Worker pminsw xm16, xm15 2916*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm16 2917*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xm16 2918*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2919*c0909341SAndroid Build Coastguard Worker sub hd, 2 2920*c0909341SAndroid Build Coastguard Worker jg 
.hv_w4_loop 2921*c0909341SAndroid Build Coastguard Worker vzeroupper 2922*c0909341SAndroid Build Coastguard Worker RET 2923*c0909341SAndroid Build Coastguard Worker.hv_w8: 2924*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2925*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 2926*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2927*c0909341SAndroid Build Coastguard Worker shr myd, 16 2928*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2929*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2930*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+myq*8] 2931*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2932*c0909341SAndroid Build Coastguard Worker sub srcq, 6 2933*c0909341SAndroid Build Coastguard Worker sub srcq, r6 2934*c0909341SAndroid Build Coastguard Worker test dword r8m, 0x800 2935*c0909341SAndroid Build Coastguard Worker jnz .hv_w8_12bit 2936*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_2176] 2937*c0909341SAndroid Build Coastguard Worker psllw xmm0, 6 2938*c0909341SAndroid Build Coastguard Worker jmp .hv_w8_main 2939*c0909341SAndroid Build Coastguard Worker.hv_w8_12bit: 2940*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_640] 2941*c0909341SAndroid Build Coastguard Worker psllw xmm0, 4 2942*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 2943*c0909341SAndroid Build Coastguard Worker.hv_w8_main: 2944*c0909341SAndroid Build Coastguard Worker mova [buf+ 0], xmm0 2945*c0909341SAndroid Build Coastguard Worker mova [buf+16], xmm1 2946*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, xmm0 2947*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [buf+ 4] 2948*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [buf+ 8] 2949*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [buf+12] 2950*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, xmm1 2951*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m17, [buf+20] 2952*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [buf+24] 2953*c0909341SAndroid Build Coastguard Worker vpbroadcastd m19, [buf+28] 2954*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2955*c0909341SAndroid Build Coastguard Worker jg .hv_w16 2956*c0909341SAndroid Build Coastguard Worker mova m5, [spel_h_shufA] 2957*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+ssq*0] 2958*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 2959*c0909341SAndroid Build Coastguard Worker movu ym9, [srcq+ssq*2] 2960*c0909341SAndroid Build Coastguard Worker add srcq, r6 2961*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 2962*c0909341SAndroid Build Coastguard Worker movu ym20, [srcq+ssq*1] 2963*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 2964*c0909341SAndroid Build Coastguard Worker add srcq, r6 2965*c0909341SAndroid Build Coastguard Worker movu ym21, [srcq+ssq*0] ; 6 2966*c0909341SAndroid Build Coastguard Worker movu m6, [spel_h_shufB] 2967*c0909341SAndroid Build Coastguard Worker movu m7, [spel_h_shufC] 2968*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m0 2969*c0909341SAndroid Build Coastguard Worker mova m1, m10 2970*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m8 ; a0 b0 2971*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m9 2972*c0909341SAndroid Build Coastguard Worker mova m2, m10 2973*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m11, m8 ; c0 d0 2974*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m20 2975*c0909341SAndroid Build Coastguard Worker mova m3, m10 2976*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m11, m8 ; e0 f0 2977*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m21 2978*c0909341SAndroid Build Coastguard Worker mova m4, m10 2979*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, m8 ; g0 2980*c0909341SAndroid Build Coastguard Worker vpermb m8, 
m6, m0 2981*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m8 ; a1 b1 2982*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m9 2983*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m8 ; c1 d1 2984*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m20 2985*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m8 ; e1 f1 2986*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m21 2987*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m8 ; g1 2988*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m0 2989*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m8 ; a2 b2 2990*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m9 2991*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m8 ; c2 d2 2992*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m20 2993*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m8 ; e2 f2 2994*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m21 2995*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m8 ; g2 2996*c0909341SAndroid Build Coastguard Worker mova m8, [spel_h_shufD] 2997*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 2998*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m0 ; a3 b3 2999*c0909341SAndroid Build Coastguard Worker mova m0, [spel_shuf8a] 3000*c0909341SAndroid Build Coastguard Worker vpermb m9, m8, m9 3001*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m9 ; c3 d3 3002*c0909341SAndroid Build Coastguard Worker mova m9, [spel_shuf8b] 3003*c0909341SAndroid Build Coastguard Worker vpermb m20, m8, m20 3004*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m20 ; e3 f3 3005*c0909341SAndroid Build Coastguard Worker vpermb m21, m8, m21 3006*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m21 ; g3 3007*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m0, m2 ; 01 12 3008*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m0, m3 ; 23 34 3009*c0909341SAndroid Build 
Coastguard Worker vpermt2b m3, m0, m4 ; 45 56 3010*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 3011*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+ssq*1] 3012*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3013*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*0], 1 3014*c0909341SAndroid Build Coastguard Worker mova m4, m10 3015*c0909341SAndroid Build Coastguard Worker vpermb m21, m5, m0 3016*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, m21 ; h0 i0 3017*c0909341SAndroid Build Coastguard Worker vpermb m21, m6, m0 3018*c0909341SAndroid Build Coastguard Worker pmaddwd m20, m16, m1 ; A0 B0 3019*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m21 ; h1 i1 3020*c0909341SAndroid Build Coastguard Worker vpermb m21, m7, m0 3021*c0909341SAndroid Build Coastguard Worker mova m1, m2 3022*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m17, m2 ; A1 B1 3023*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m21 ; h2 i2 3024*c0909341SAndroid Build Coastguard Worker vpermb m21, m8, m0 3025*c0909341SAndroid Build Coastguard Worker mova m2, m3 3026*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m18, m3 ; A2 B2 3027*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m21 ; h3 i3 3028*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m9, m4 ; 67 78 3029*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m19, m3 ; A3 B3 3030*c0909341SAndroid Build Coastguard Worker psrad m20, 10 3031*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym21, m20, 1 3032*c0909341SAndroid Build Coastguard Worker packusdw ym20, ym21 3033*c0909341SAndroid Build Coastguard Worker pminsw ym20, ym15 3034*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm20 3035*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], ym20, 1 3036*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 3037*c0909341SAndroid Build Coastguard Worker sub hd, 2 3038*c0909341SAndroid 
Build Coastguard Worker jg .hv_w8_loop 3039*c0909341SAndroid Build Coastguard Worker vzeroupper 3040*c0909341SAndroid Build Coastguard Worker RET 3041*c0909341SAndroid Build Coastguard Worker.hv_w16: 3042*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 26 3043*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [spel_h_shufA] 3044*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m21, [spel_h_shufB] 3045*c0909341SAndroid Build Coastguard Worker add wd, wd 3046*c0909341SAndroid Build Coastguard Worker mova m9, [spel_shuf16] 3047*c0909341SAndroid Build Coastguard Worker lea wd, [hq+wq*8-256] 3048*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 3049*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m5, [srcq+ssq*0+ 8] 3050*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 3051*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 3052*c0909341SAndroid Build Coastguard Worker movu ym6, [srcq+ssq*1+ 0] 3053*c0909341SAndroid Build Coastguard Worker movu ym7, [srcq+ssq*1+16] 3054*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+r6] 3055*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [srcq+ssq*2+ 0], 1 3056*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 3057*c0909341SAndroid Build Coastguard Worker movu ym22, [r7 +ssq*0+ 0] 3058*c0909341SAndroid Build Coastguard Worker movu ym23, [r7 +ssq*0+16] 3059*c0909341SAndroid Build Coastguard Worker mov r8, dstq 3060*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, [r7 +ssq*1+ 0], 1 3061*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4 3062*c0909341SAndroid Build Coastguard Worker movu ym24, [r7 +ssq*2+ 0] 3063*c0909341SAndroid Build Coastguard Worker movu ym25, [r7 +ssq*2+16] 3064*c0909341SAndroid Build Coastguard Worker add r7, r6 3065*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, [r7 +ssq*0+ 0], 1 3066*c0909341SAndroid Build 
Coastguard Worker vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 3067*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m20 3068*c0909341SAndroid Build Coastguard Worker mova m1, m10 3069*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m11, m0 ; a0 3070*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m20 3071*c0909341SAndroid Build Coastguard Worker mova m2, m10 3072*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m11, m0 ; b0 3073*c0909341SAndroid Build Coastguard Worker pshufb m0, m7, m20 3074*c0909341SAndroid Build Coastguard Worker mova m3, m10 3075*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m0 ; c2 3076*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m21 3077*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m0 ; a1 3078*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m21 3079*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m0 ; b1 3080*c0909341SAndroid Build Coastguard Worker pshufb m0, m7, m21 3081*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m0 ; c3 3082*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m20 3083*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m0 ; a2 3084*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x55 3085*c0909341SAndroid Build Coastguard Worker pshufb m7, m6, m20 3086*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m7 ; b2 3087*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m11, m7 ; c0 3088*c0909341SAndroid Build Coastguard Worker pshufb m5, m21 3089*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m5 ; a3 3090*c0909341SAndroid Build Coastguard Worker pshufb m6, m21 3091*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m6 ; b3 3092*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m6 ; c1 3093*c0909341SAndroid Build Coastguard Worker pshufb m0, m22, m20 3094*c0909341SAndroid Build Coastguard Worker mova m4, m10 3095*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m11, 
m0 ; d0 3096*c0909341SAndroid Build Coastguard Worker pshufb m0, m23, m20 3097*c0909341SAndroid Build Coastguard Worker mova m5, m10 3098*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m0 ; e2 3099*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m20 3100*c0909341SAndroid Build Coastguard Worker mova m6, m10 3101*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m11, m0 ; f0 3102*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m20 3103*c0909341SAndroid Build Coastguard Worker mova m7, m10 3104*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m0 ; g2 3105*c0909341SAndroid Build Coastguard Worker pshufb m0, m22, m21 3106*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m0 ; d1 3107*c0909341SAndroid Build Coastguard Worker pshufb m0, m23, m21 3108*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m0 ; e3 3109*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m21 3110*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; f1 3111*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m21 3112*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m0 ; g3 3113*c0909341SAndroid Build Coastguard Worker shufpd m22, m23, 0x55 3114*c0909341SAndroid Build Coastguard Worker pshufb m23, m22, m20 3115*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m23 ; d2 3116*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m23 ; e0 3117*c0909341SAndroid Build Coastguard Worker shufpd m24, m25, 0x55 3118*c0909341SAndroid Build Coastguard Worker pshufb m25, m24, m20 3119*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m25 ; f2 3120*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m11, m25 ; g0 3121*c0909341SAndroid Build Coastguard Worker pshufb m22, m21 3122*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m22 ; d3 3123*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m22 ; e1 3124*c0909341SAndroid Build Coastguard Worker pshufb m24, m21 
3125*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m24 ; f3 3126*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m24 ; g1 3127*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 3128*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m9, m3 ; 12 3129*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m9, m5 ; 34 3130*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m9, m7 ; 56 3131*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m2, 16 ; 01 3132*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 23 3133*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m4, m6, 16 ; 45 3134*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 3135*c0909341SAndroid Build Coastguard Worker movu ym24, [r7+ssq*1+ 0] 3136*c0909341SAndroid Build Coastguard Worker movu ym25, [r7+ssq*1+16] 3137*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 3138*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, [r7+ssq*0+ 0], 1 3139*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, [r7+ssq*0+16], 1 3140*c0909341SAndroid Build Coastguard Worker mova m7, m10 3141*c0909341SAndroid Build Coastguard Worker mova m8, m10 3142*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m20 3143*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m11, m0 ; h0 3144*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m20 3145*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m13, m0 ; i2 3146*c0909341SAndroid Build Coastguard Worker pmaddwd m22, m16, m1 ; A0 3147*c0909341SAndroid Build Coastguard Worker mova m1, m3 3148*c0909341SAndroid Build Coastguard Worker pmaddwd m23, m16, m2 ; B0 3149*c0909341SAndroid Build Coastguard Worker mova m2, m4 3150*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m21 3151*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m0 ; h1 3152*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m21 3153*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m14, m0 ; i3 
3154*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m17, m3 ; A1 3155*c0909341SAndroid Build Coastguard Worker mova m3, m5 3156*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m17, m4 ; B1 3157*c0909341SAndroid Build Coastguard Worker mova m4, m6 3158*c0909341SAndroid Build Coastguard Worker shufpd m24, m25, 0x55 3159*c0909341SAndroid Build Coastguard Worker pshufb m25, m24, m20 3160*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m25 ; h2 3161*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m11, m25 ; i0 3162*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m18, m5 ; A2 3163*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m18, m6 ; B2 3164*c0909341SAndroid Build Coastguard Worker pshufb m24, m21 3165*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m24 ; h3 3166*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m12, m24 ; i1 3167*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m9, m8 ; 78 3168*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m6, m7, 16 ; 67 3169*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m19, m5 ; A3 3170*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m19, m7 ; B3 3171*c0909341SAndroid Build Coastguard Worker mova m6, m7 3172*c0909341SAndroid Build Coastguard Worker psrad m22, 10 3173*c0909341SAndroid Build Coastguard Worker psrad m23, 10 3174*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m22, m23, q3232 3175*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, ym23, 1 3176*c0909341SAndroid Build Coastguard Worker packusdw m22, m0 3177*c0909341SAndroid Build Coastguard Worker pminsw m22, m15 3178*c0909341SAndroid Build Coastguard Worker mova [r8+dsq*0], ym22 3179*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r8+dsq*1], m22, 1 3180*c0909341SAndroid Build Coastguard Worker lea r8, [r8+dsq*2] 3181*c0909341SAndroid Build Coastguard Worker sub hd, 2 3182*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 3183*c0909341SAndroid Build 
Coastguard Worker add srcq, 32 3184*c0909341SAndroid Build Coastguard Worker add dstq, 32 3185*c0909341SAndroid Build Coastguard Worker movzx hd, wb 3186*c0909341SAndroid Build Coastguard Worker sub wd, 1<<8 3187*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop0 3188*c0909341SAndroid Build Coastguard Worker RET 3189*c0909341SAndroid Build Coastguard Worker 3190*c0909341SAndroid Build Coastguard Worker%if WIN64 3191*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4 3192*c0909341SAndroid Build Coastguard Worker%else 3193*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7 3194*c0909341SAndroid Build Coastguard Worker%endif 3195*c0909341SAndroid Build Coastguard Worker 3196*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap, 3197*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc 3198*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc 3199*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc 3200*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular, REGULAR, REGULAR 3201*c0909341SAndroid Build Coastguard Worker 3202*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my 3203*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl 3204*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 3205*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 6tap_h, mx, 4tap_h 3206*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 3207*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 6tap_v, my, 4tap_v 3208*c0909341SAndroid Build Coastguard Worker lea r7, [prep_avx512icl] 3209*c0909341SAndroid Build Coastguard Worker mov wd, wm 3210*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3211*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 3212*c0909341SAndroid Build Coastguard 
Worker jnz .h 3213*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3214*c0909341SAndroid Build Coastguard Worker jnz .v 3215*c0909341SAndroid Build Coastguard Worker.prep: 3216*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 3217*c0909341SAndroid Build Coastguard Worker mov r5d, r7m ; bitdepth_max 3218*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [pw_8192] 3219*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(prep,)] 3220*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3221*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] 3222*c0909341SAndroid Build Coastguard Worker add wq, r7 3223*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 3224*c0909341SAndroid Build Coastguard Worker%if WIN64 3225*c0909341SAndroid Build Coastguard Worker pop r7 3226*c0909341SAndroid Build Coastguard Worker%endif 3227*c0909341SAndroid Build Coastguard Worker jmp wq 3228*c0909341SAndroid Build Coastguard Worker.h_w8: 3229*c0909341SAndroid Build Coastguard Worker mova m6, [spel_h_shufA] 3230*c0909341SAndroid Build Coastguard Worker movu m7, [spel_h_shufC] 3231*c0909341SAndroid Build Coastguard Worker mova m8, [prep_endB] 3232*c0909341SAndroid Build Coastguard Worker.h_w8_loop: 3233*c0909341SAndroid Build Coastguard Worker movu ym4, [srcq+ssq*0] 3234*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, [srcq+ssq*1], 1 3235*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+ssq*2] 3236*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+r6 ], 1 3237*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3238*c0909341SAndroid Build Coastguard Worker mova m0, m10 3239*c0909341SAndroid Build Coastguard Worker mova m1, m10 3240*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m4 3241*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m5 3242*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m2 ; a0 b0 3243*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m1, m12, m3 ; c0 d0 3244*c0909341SAndroid Build Coastguard Worker vpermb m4, m7, m4 3245*c0909341SAndroid Build Coastguard Worker vpermb m5, m7, m5 3246*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m4 ; a2 b2 3247*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m5 ; c2 d2 3248*c0909341SAndroid Build Coastguard Worker shufpd m2, m4, 0x55 3249*c0909341SAndroid Build Coastguard Worker shufpd m3, m5, 0x55 3250*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 ; a1 b1 3251*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m3 ; c1 d1 3252*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m8, m1 3253*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 3254*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3255*c0909341SAndroid Build Coastguard Worker sub hd, 4 3256*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 3257*c0909341SAndroid Build Coastguard Worker RET 3258*c0909341SAndroid Build Coastguard Worker.h: 3259*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [prep_8tap_rnd] 3260*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3261*c0909341SAndroid Build Coastguard Worker jnz .hv 3262*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 3263*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3264*c0909341SAndroid Build Coastguard Worker je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4 3265*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3266*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3267*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 3268*c0909341SAndroid Build Coastguard Worker sub srcq, 4 3269*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3270*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 3271*c0909341SAndroid Build Coastguard Worker mova [tmpq], xmm0 3272*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 3273*c0909341SAndroid Build 
Coastguard Worker vpbroadcastd m13, [tmpq+ 4] 3274*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 3275*c0909341SAndroid Build Coastguard Worker cmp wd, 16 3276*c0909341SAndroid Build Coastguard Worker jl .h_w8 3277*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [spel_h_shufA] 3278*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufB] 3279*c0909341SAndroid Build Coastguard Worker mova m7, [prep_endC] 3280*c0909341SAndroid Build Coastguard Worker jg .h_w32 3281*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 3282*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+ssq*0+ 0] 3283*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+ssq*1+ 0], 1 3284*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*0+12] 3285*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*1+12], 1 3286*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3287*c0909341SAndroid Build Coastguard Worker mova m0, m10 3288*c0909341SAndroid Build Coastguard Worker mova m1, m10 3289*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m5 ; 01 3290*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 ; a0 b0 3291*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 ; 89 3292*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m4 ; a2' b2' 3293*c0909341SAndroid Build Coastguard Worker pshufb m2, m6 ; 23 3294*c0909341SAndroid Build Coastguard Worker pshufb m3, m5 ; 67 3295*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 ; a1 b1 3296*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m3 ; a1' b1' 3297*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 ; 45 3298*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m2 ; a2 b2 3299*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m2 ; a0' b0' 3300*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m7, m1 3301*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 
3302*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3303*c0909341SAndroid Build Coastguard Worker sub hd, 2 3304*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 3305*c0909341SAndroid Build Coastguard Worker RET 3306*c0909341SAndroid Build Coastguard Worker.h_w32: 3307*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 3308*c0909341SAndroid Build Coastguard Worker neg wq 3309*c0909341SAndroid Build Coastguard Worker.h_w32_loop0: 3310*c0909341SAndroid Build Coastguard Worker mov r6, wq 3311*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 3312*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+ 0] 3313*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+r6*2+12] 3314*c0909341SAndroid Build Coastguard Worker mova m0, m10 3315*c0909341SAndroid Build Coastguard Worker mova m1, m10 3316*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m5 3317*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 3318*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 3319*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m4 3320*c0909341SAndroid Build Coastguard Worker pshufb m2, m6 3321*c0909341SAndroid Build Coastguard Worker pshufb m3, m5 3322*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 3323*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m3 3324*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 3325*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m2 3326*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m2 3327*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m7, m1 3328*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 3329*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3330*c0909341SAndroid Build Coastguard Worker add r6, 32 3331*c0909341SAndroid Build Coastguard Worker jl .h_w32_loop 3332*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3333*c0909341SAndroid Build Coastguard Worker dec hd 3334*c0909341SAndroid Build 
Coastguard Worker jg .h_w32_loop0 3335*c0909341SAndroid Build Coastguard Worker RET 3336*c0909341SAndroid Build Coastguard Worker.v: 3337*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3338*c0909341SAndroid Build Coastguard Worker shr myd, 16 3339*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3340*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3341*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 3342*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [prep_8tap_rnd] 3343*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+myq*8] 3344*c0909341SAndroid Build Coastguard Worker tzcnt r6d, wd 3345*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3346*c0909341SAndroid Build Coastguard Worker movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)] 3347*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 3348*c0909341SAndroid Build Coastguard Worker add r7, r6 3349*c0909341SAndroid Build Coastguard Worker mova [tmpq], xmm0 3350*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 3351*c0909341SAndroid Build Coastguard Worker mov r6, ssq 3352*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+ 4] 3353*c0909341SAndroid Build Coastguard Worker neg r6 3354*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 3355*c0909341SAndroid Build Coastguard Worker jmp r7 3356*c0909341SAndroid Build Coastguard Worker.v_w4: 3357*c0909341SAndroid Build Coastguard Worker mov r3d, 0x330c 3358*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+r6 *2] 3359*c0909341SAndroid Build Coastguard Worker kmovw k1, r3d 3360*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym1{k1}, [srcq+r6 *1] 3361*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+ssq*0] 3362*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3 3363*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*2] 3364*c0909341SAndroid Build Coastguard 
Worker mova ym4, [prep_endA] 3365*c0909341SAndroid Build Coastguard Worker valignq m0, m1, 2 3366*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0 ; 01 12 23 34 3367*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 3368*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3369*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+r6 *1] 3370*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym2{k1}, [srcq+ssq*0] 3371*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 3372*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3 3373*c0909341SAndroid Build Coastguard Worker mova m3, m10 3374*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3375*c0909341SAndroid Build Coastguard Worker valignq m0, m2, m0, 6 ; 4 5 6 7 3376*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 ; 45 56 67 78 3377*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m0 ; a2 b2 c2 d2 3378*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m0, q1032 ; 23 34 45 56 3379*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m1 ; a1 b1 c1 d1 3380*c0909341SAndroid Build Coastguard Worker mova m1, m0 3381*c0909341SAndroid Build Coastguard Worker mova m0, m2 3382*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, m3 3383*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym3 3384*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3385*c0909341SAndroid Build Coastguard Worker sub hd, 4 3386*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 3387*c0909341SAndroid Build Coastguard Worker RET 3388*c0909341SAndroid Build Coastguard Worker.v_w8: 3389*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym1, [srcq+r6 *1] 3390*c0909341SAndroid Build Coastguard Worker mov r3d, 0x33 3391*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ssq*0] 3392*c0909341SAndroid Build Coastguard Worker kmovb k1, r3d 3393*c0909341SAndroid Build Coastguard Worker 
mova m6, [spel_v_shuf8] 3394*c0909341SAndroid Build Coastguard Worker vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2 3395*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [srcq+ssq*1] 3396*c0909341SAndroid Build Coastguard Worker vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4 3397*c0909341SAndroid Build Coastguard Worker mova m7, [prep_endB] 3398*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m1 ; 01 12 3399*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m0 ; 23 34 3400*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 3401*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3402*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [srcq+r6 *1] 3403*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*0] 3404*c0909341SAndroid Build Coastguard Worker vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6 3405*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [srcq+ssq*1] 3406*c0909341SAndroid Build Coastguard Worker vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8 3407*c0909341SAndroid Build Coastguard Worker mova m4, m10 3408*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m1 ; a0 b0 3409*c0909341SAndroid Build Coastguard Worker mova m5, m10 3410*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m2 ; c0 d0 3411*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m3 ; 45 56 3412*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 b1 3413*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m0 ; 67 78 3414*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m1 ; c1 d1 3415*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m1 ; a2 b2 3416*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m2 ; c2 d2 3417*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m7, m5 3418*c0909341SAndroid Build Coastguard Worker mova [tmpq], m4 3419*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3420*c0909341SAndroid Build Coastguard Worker sub hd, 4 
3421*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 3422*c0909341SAndroid Build Coastguard Worker RET 3423*c0909341SAndroid Build Coastguard Worker.v_w16: 3424*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m0, [srcq+r6 *1] 3425*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2 3426*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1 3427*c0909341SAndroid Build Coastguard Worker mova m6, [spel_v_shuf16] 3428*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*1] 3429*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3430*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4 3431*c0909341SAndroid Build Coastguard Worker mova m7, [prep_endA] 3432*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m1 ; 12 3433*c0909341SAndroid Build Coastguard Worker vpermb m0, m6, m0 ; 01 3434*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m3 ; 34 3435*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 23 3436*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 3437*c0909341SAndroid Build Coastguard Worker mova m5, m10 3438*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m1 ; b0 3439*c0909341SAndroid Build Coastguard Worker mova m4, m10 3440*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m0 ; a0 3441*c0909341SAndroid Build Coastguard Worker mova m1, m3 3442*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m3 ; b1 3443*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+ssq*1] 3444*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3445*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m2 ; a1 3446*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+ssq*0], 1 3447*c0909341SAndroid Build Coastguard Worker mova m0, m2 3448*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m3 ; 56 3449*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 45 
3450*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m3 ; b2 3451*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m2 ; a2 3452*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m7, m5 3453*c0909341SAndroid Build Coastguard Worker mova [tmpq], m4 3454*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3455*c0909341SAndroid Build Coastguard Worker sub hd, 2 3456*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 3457*c0909341SAndroid Build Coastguard Worker RET 3458*c0909341SAndroid Build Coastguard Worker.v_w32: 3459*c0909341SAndroid Build Coastguard Worker.v_w64: 3460*c0909341SAndroid Build Coastguard Worker.v_w128: 3461*c0909341SAndroid Build Coastguard Worker%if WIN64 3462*c0909341SAndroid Build Coastguard Worker push r8 3463*c0909341SAndroid Build Coastguard Worker%endif 3464*c0909341SAndroid Build Coastguard Worker mova m11, [prep_endC] 3465*c0909341SAndroid Build Coastguard Worker lea r5, [hq+wq*8-256] 3466*c0909341SAndroid Build Coastguard Worker.v_w32_loop0: 3467*c0909341SAndroid Build Coastguard Worker movu m4, [srcq+r6 *2] 3468*c0909341SAndroid Build Coastguard Worker movu m5, [srcq+r6 *1] 3469*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*2] 3470*c0909341SAndroid Build Coastguard Worker movu m6, [srcq+ssq*0] 3471*c0909341SAndroid Build Coastguard Worker movu m7, [srcq+ssq*1] 3472*c0909341SAndroid Build Coastguard Worker mov r8, tmpq 3473*c0909341SAndroid Build Coastguard Worker movu m8, [r7 +ssq*0] 3474*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m4, m5 ; 01 3475*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 3476*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m5, m6 ; 12 3477*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 3478*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m6, m7 ; 23 3479*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 3480*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m7, m8 ; 34 3481*c0909341SAndroid Build Coastguard 
Worker punpckhwd m7, m8 3482*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 3483*c0909341SAndroid Build Coastguard Worker mova m16, m10 3484*c0909341SAndroid Build Coastguard Worker movu m9, [r7+ssq*1] 3485*c0909341SAndroid Build Coastguard Worker mova m18, m10 3486*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m12, m0 ; a0 3487*c0909341SAndroid Build Coastguard Worker mova m17, m10 3488*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m12, m4 3489*c0909341SAndroid Build Coastguard Worker mova m19, m10 3490*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m12, m1 ; b0 3491*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 3492*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m12, m5 3493*c0909341SAndroid Build Coastguard Worker mova m0, m2 3494*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m13, m2 ; a1 3495*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m8, m9 ; 45 3496*c0909341SAndroid Build Coastguard Worker mova m4, m6 3497*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m13, m6 3498*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m8, m9 3499*c0909341SAndroid Build Coastguard Worker movu m8, [r7+ssq*0] 3500*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m13, m3 ; b1 3501*c0909341SAndroid Build Coastguard Worker mova m1, m3 3502*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m13, m7 3503*c0909341SAndroid Build Coastguard Worker mova m5, m7 3504*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m14, m2 ; a2 3505*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m9, m8 ; 56 3506*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m14, m6 3507*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m9, m8 3508*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m14, m3 ; b2 3509*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m14, m7 3510*c0909341SAndroid Build Coastguard Worker vpermt2b m16, m11, m18 3511*c0909341SAndroid Build Coastguard Worker vpermt2b 
m17, m11, m19 3512*c0909341SAndroid Build Coastguard Worker mova [r8+wq*0], m16 3513*c0909341SAndroid Build Coastguard Worker mova [r8+wq*2], m17 3514*c0909341SAndroid Build Coastguard Worker lea r8, [r8+wq*4] 3515*c0909341SAndroid Build Coastguard Worker sub hd, 2 3516*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 3517*c0909341SAndroid Build Coastguard Worker add srcq, 64 3518*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3519*c0909341SAndroid Build Coastguard Worker movzx hd, r5b 3520*c0909341SAndroid Build Coastguard Worker sub r5d, 1<<8 3521*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop0 3522*c0909341SAndroid Build Coastguard Worker%if WIN64 3523*c0909341SAndroid Build Coastguard Worker pop r8 3524*c0909341SAndroid Build Coastguard Worker%endif 3525*c0909341SAndroid Build Coastguard Worker vzeroupper 3526*c0909341SAndroid Build Coastguard Worker RET 3527*c0909341SAndroid Build Coastguard Worker.hv_w4: 3528*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 3529*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 3530*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3531*c0909341SAndroid Build Coastguard Worker shr myd, 16 3532*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3533*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3534*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 3535*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3536*c0909341SAndroid Build Coastguard Worker mov r6, ssq 3537*c0909341SAndroid Build Coastguard Worker sub srcq, 2 3538*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3539*c0909341SAndroid Build Coastguard Worker neg r6 3540*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 3541*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 3542*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], xmm0 3543*c0909341SAndroid Build Coastguard Worker mova [tmpq+16], xmm1 
3544*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [tmpq+ 4] 3545*c0909341SAndroid Build Coastguard Worker mov r3d, 0xf0 3546*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [tmpq+ 8] 3547*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm1 3548*c0909341SAndroid Build Coastguard Worker movu xm3, [srcq+r6 *2] 3549*c0909341SAndroid Build Coastguard Worker kmovb k1, r3d 3550*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym3, [srcq+r6 *1], 1 3551*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ssq*0] 3552*c0909341SAndroid Build Coastguard Worker vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3 3553*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+ssq*2] 3554*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [spel_h_shufA] 3555*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufB] 3556*c0909341SAndroid Build Coastguard Worker mova m1, m11 3557*c0909341SAndroid Build Coastguard Worker mova m15, [spel_shuf4a] 3558*c0909341SAndroid Build Coastguard Worker mova xm2, xm11 3559*c0909341SAndroid Build Coastguard Worker pshufb m0, m3, m5 3560*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m8, m0 3561*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm4, xm5 3562*c0909341SAndroid Build Coastguard Worker vpdpwssd xm2, xm8, xm0 3563*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+20] 3564*c0909341SAndroid Build Coastguard Worker pshufb m3, m6 3565*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+24] 3566*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm6 3567*c0909341SAndroid Build Coastguard Worker mova m7, [spel_shuf4b] 3568*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m3 ; 0 1 2 3 3569*c0909341SAndroid Build Coastguard Worker vpdpwssd xm2, xm9, xm4 ; 4 3570*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m15, m2 ; 01 12 23 34 3571*c0909341SAndroid Build Coastguard Worker mova ym15, [prep_endA] 3572*c0909341SAndroid 
Build Coastguard Worker.hv_w4_loop: 3573*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3574*c0909341SAndroid Build Coastguard Worker movu xm4, [srcq+r6 *1] 3575*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym4, [srcq+ssq*0], 1 3576*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [srcq+ssq*1] 3577*c0909341SAndroid Build Coastguard Worker vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3 3578*c0909341SAndroid Build Coastguard Worker mova m2, m11 3579*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m5 3580*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m8, m3 3581*c0909341SAndroid Build Coastguard Worker mova m3, m10 3582*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 3583*c0909341SAndroid Build Coastguard Worker pshufb m4, m6 3584*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m4 ; 5 6 7 8 3585*c0909341SAndroid Build Coastguard Worker mova m4, m1 3586*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m7, m2 ; 45 56 67 78 3587*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 3588*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m1, q1032 ; 23 34 45 56 3589*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m4 ; a1 b1 c1 d1 3590*c0909341SAndroid Build Coastguard Worker vpermb m3, m15, m3 3591*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym3 3592*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3593*c0909341SAndroid Build Coastguard Worker sub hd, 4 3594*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 3595*c0909341SAndroid Build Coastguard Worker RET 3596*c0909341SAndroid Build Coastguard Worker.hv_w8: 3597*c0909341SAndroid Build Coastguard Worker mova m8, [spel_h_shufA] 3598*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+r6 *2] 3599*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1 3600*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*0] 3601*c0909341SAndroid 
Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3 3602*c0909341SAndroid Build Coastguard Worker movu ym20, [srcq+ssq*2] ; 4 3603*c0909341SAndroid Build Coastguard Worker movu m9, [spel_h_shufC] 3604*c0909341SAndroid Build Coastguard Worker mova m21, [spel_shuf8a] 3605*c0909341SAndroid Build Coastguard Worker mova m0, [spel_shuf8b] 3606*c0909341SAndroid Build Coastguard Worker vpermb m4, m8, m18 3607*c0909341SAndroid Build Coastguard Worker mova m1, m10 3608*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m19 3609*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; a0 b0 3610*c0909341SAndroid Build Coastguard Worker mova m2, m10 3611*c0909341SAndroid Build Coastguard Worker vpermb m6, m8, m20 3612*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m5 ; c0 d0 3613*c0909341SAndroid Build Coastguard Worker mova m3, m10 3614*c0909341SAndroid Build Coastguard Worker vpermb m18, m9, m18 3615*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m6 ; e0 3616*c0909341SAndroid Build Coastguard Worker mova m7, [prep_endB] 3617*c0909341SAndroid Build Coastguard Worker vpermb m19, m9, m19 3618*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m18 ; a2 b2 3619*c0909341SAndroid Build Coastguard Worker vpermb m20, m9, m20 3620*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m19 ; c2 d2 3621*c0909341SAndroid Build Coastguard Worker shufpd m4, m18, 0x55 3622*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m20 ; e2 3623*c0909341SAndroid Build Coastguard Worker shufpd m5, m19, 0x55 3624*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m4 ; a1 b1 3625*c0909341SAndroid Build Coastguard Worker shufpd m6, m20, 0x55 3626*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m5 ; c1 d1 3627*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m6 ; e1 3628*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m21, m2 ; 01 12 3629*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m21, m3 ; 
23 34 3630*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 3631*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3632*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+r6 *1] 3633*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0], 1 3634*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*1] 3635*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*2], 1 3636*c0909341SAndroid Build Coastguard Worker mova m3, m10 3637*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m18 3638*c0909341SAndroid Build Coastguard Worker mova m4, m10 3639*c0909341SAndroid Build Coastguard Worker vpermb m6, m8, m19 3640*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m5 ; f0 g0 3641*c0909341SAndroid Build Coastguard Worker mova m20, m11 3642*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m6 ; h0 i0 3643*c0909341SAndroid Build Coastguard Worker mova m21, m11 3644*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m15, m1 ; A0 B0 3645*c0909341SAndroid Build Coastguard Worker vpermb m18, m9, m18 3646*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m15, m2 ; C0 D0 3647*c0909341SAndroid Build Coastguard Worker vpermb m19, m9, m19 3648*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m18 ; f2 g2 3649*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m19 ; h2 i2 3650*c0909341SAndroid Build Coastguard Worker shufpd m5, m18, 0x55 3651*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m16, m2 ; A1 B1 3652*c0909341SAndroid Build Coastguard Worker shufpd m6, m19, 0x55 3653*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m5 ; f1 g1 3654*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m6 ; h1 i1 3655*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m0, m3 ; 45 56 3656*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m16, m2 ; C1 D1 3657*c0909341SAndroid Build Coastguard Worker mova m1, m2 3658*c0909341SAndroid Build Coastguard 
Worker vpermt2b m2, m0, m4 ; 67 78 3659*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m17, m1 ; A2 B2 3660*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m17, m2 ; C2 D2 3661*c0909341SAndroid Build Coastguard Worker vpermt2b m20, m7, m21 3662*c0909341SAndroid Build Coastguard Worker mova [tmpq], m20 3663*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3664*c0909341SAndroid Build Coastguard Worker sub hd, 4 3665*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 3666*c0909341SAndroid Build Coastguard Worker vzeroupper 3667*c0909341SAndroid Build Coastguard Worker RET 3668*c0909341SAndroid Build Coastguard Worker.hv: 3669*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [pd_128] 3670*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3671*c0909341SAndroid Build Coastguard Worker je .hv_w4 3672*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3673*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] 3674*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3675*c0909341SAndroid Build Coastguard Worker shr myd, 16 3676*c0909341SAndroid Build Coastguard Worker cmp hd, 6 3677*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 3678*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 3679*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+1+myq*8] 3680*c0909341SAndroid Build Coastguard Worker mov r6, ssq 3681*c0909341SAndroid Build Coastguard Worker sub srcq, 4 3682*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3683*c0909341SAndroid Build Coastguard Worker neg r6 3684*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 3685*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 3686*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], xmm0 3687*c0909341SAndroid Build Coastguard Worker mova [tmpq+16], xmm1 3688*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 3689*c0909341SAndroid Build Coastguard Worker
vpbroadcastd m13, [tmpq+ 4] 3690*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 3691*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, xmm1 3692*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, [tmpq+20] 3693*c0909341SAndroid Build Coastguard Worker vpbroadcastd m17, [tmpq+24] 3694*c0909341SAndroid Build Coastguard Worker cmp wd, 16 3695*c0909341SAndroid Build Coastguard Worker jl .hv_w8 3696*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [spel_h_shufA] 3697*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [spel_h_shufB] 3698*c0909341SAndroid Build Coastguard Worker jg .hv_w32 3699*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [srcq+r6 *2+ 8] 3700*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 3701*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 3702*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+r6 *1+ 0] 3703*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+r6 *1+12] 3704*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3705*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2 3706*c0909341SAndroid Build Coastguard Worker movu ym20, [srcq+ssq*1+ 0] 3707*c0909341SAndroid Build Coastguard Worker movu ym21, [srcq+ssq*1+12] 3708*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3709*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, [srcq+ssq*0+ 0], 1 3710*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4 3711*c0909341SAndroid Build Coastguard Worker pshufb m2, m8 3712*c0909341SAndroid Build Coastguard Worker mova m1, m10 3713*c0909341SAndroid Build Coastguard Worker pshufb m3, m18, m8 3714*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m2 ; a2 3715*c0909341SAndroid Build Coastguard Worker mova m2, m10 3716*c0909341SAndroid Build Coastguard Worker pshufb m4, m19, m9 
3717*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m3 ; b0 c0 3718*c0909341SAndroid Build Coastguard Worker mova m3, m10 3719*c0909341SAndroid Build Coastguard Worker pshufb m5, m20, m8 3720*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m4 ; b2' c2' 3721*c0909341SAndroid Build Coastguard Worker mova m4, m10 3722*c0909341SAndroid Build Coastguard Worker pshufb m7, m21, m9 3723*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m5 ; d0 e0 3724*c0909341SAndroid Build Coastguard Worker mova m5, m10 3725*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m8 3726*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m7 ; d2' e2' 3727*c0909341SAndroid Build Coastguard Worker mova m7, [spel_shuf16] 3728*c0909341SAndroid Build Coastguard Worker pshufb m18, m9 3729*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m0 ; a0 3730*c0909341SAndroid Build Coastguard Worker pshufb m19, m8 3731*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m18 ; b1 c1 3732*c0909341SAndroid Build Coastguard Worker pshufb m20, m9 3733*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m19 ; b1' c1' 3734*c0909341SAndroid Build Coastguard Worker pshufb m21, m8 3735*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m20 ; d1 e1 3736*c0909341SAndroid Build Coastguard Worker pshufb m6, m9 3737*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m21 ; d1' e1' 3738*c0909341SAndroid Build Coastguard Worker mova m0, [prep_endB] 3739*c0909341SAndroid Build Coastguard Worker shufpd m18, m19, 0x55 3740*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m6 ; a1 3741*c0909341SAndroid Build Coastguard Worker shufpd m20, m21, 0x55 3742*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m18 ; b2 c2 3743*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m18 ; b0' c0' 3744*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m20 ; d2 e2 3745*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, 
m20 ; d0' e0' 3746*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 3747*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m7, m3 ; 12 3748*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m7, m5 ; 34 3749*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m2, 16 ; 01 3750*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 23 3751*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 3752*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*1+ 0] 3753*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*1+12] 3754*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3755*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [srcq+ssq*0+ 0], 1 3756*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*0+12], 1 3757*c0909341SAndroid Build Coastguard Worker mova m5, m10 3758*c0909341SAndroid Build Coastguard Worker mova m6, m10 3759*c0909341SAndroid Build Coastguard Worker pshufb m21, m18, m8 3760*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m21 ; f0 g0 3761*c0909341SAndroid Build Coastguard Worker pshufb m20, m19, m9 3762*c0909341SAndroid Build Coastguard Worker mova m21, m11 3763*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m20 ; f2' g2' 3764*c0909341SAndroid Build Coastguard Worker mova m20, m11 3765*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m15, m2 ; B0 3766*c0909341SAndroid Build Coastguard Worker mova m2, m4 3767*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m15, m1 ; A0 3768*c0909341SAndroid Build Coastguard Worker mova m1, m3 3769*c0909341SAndroid Build Coastguard Worker pshufb m18, m9 3770*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m18 ; f1 g1 3771*c0909341SAndroid Build Coastguard Worker pshufb m19, m8 3772*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m19 ; f1' g1' 3773*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m16, m4 ; B1 3774*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m16, 
m3 ; A1 3775*c0909341SAndroid Build Coastguard Worker shufpd m18, m19, 0x55 3776*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m18 ; f2 g2 3777*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m18 ; f0' g0' 3778*c0909341SAndroid Build Coastguard Worker mova m4, m7 3779*c0909341SAndroid Build Coastguard Worker vpermi2b m4, m5, m6 ; 56 3780*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 45 3781*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m17, m4 ; B2 3782*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m17, m3 ; A2 3783*c0909341SAndroid Build Coastguard Worker vpermt2b m20, m0, m21 3784*c0909341SAndroid Build Coastguard Worker mova [tmpq], m20 3785*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3786*c0909341SAndroid Build Coastguard Worker sub hd, 2 3787*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 3788*c0909341SAndroid Build Coastguard Worker vzeroupper 3789*c0909341SAndroid Build Coastguard Worker RET 3790*c0909341SAndroid Build Coastguard Worker.hv_w32: 3791*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 29 3792*c0909341SAndroid Build Coastguard Worker%if WIN64 3793*c0909341SAndroid Build Coastguard Worker push r8 3794*c0909341SAndroid Build Coastguard Worker%endif 3795*c0909341SAndroid Build Coastguard Worker mova m27, [spel_shuf32] 3796*c0909341SAndroid Build Coastguard Worker lea r5d, [hq+wq*8-256] 3797*c0909341SAndroid Build Coastguard Worker mova m28, [prep_endC] 3798*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0: 3799*c0909341SAndroid Build Coastguard Worker movu m18, [srcq+r6 *2+ 0] 3800*c0909341SAndroid Build Coastguard Worker movu m7, [srcq+r6 *2+12] 3801*c0909341SAndroid Build Coastguard Worker movu m6, [srcq+r6 *1+ 0] 3802*c0909341SAndroid Build Coastguard Worker movu m20, [srcq+r6 *1+12] 3803*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+ssq*2] 3804*c0909341SAndroid Build Coastguard Worker movu m19, [srcq+ssq*0+ 0] 3805*c0909341SAndroid Build 
Coastguard Worker movu m21, [srcq+ssq*0+12] 3806*c0909341SAndroid Build Coastguard Worker movu m22, [srcq+ssq*1+ 0] 3807*c0909341SAndroid Build Coastguard Worker movu m24, [srcq+ssq*1+12] 3808*c0909341SAndroid Build Coastguard Worker mov r8, tmpq 3809*c0909341SAndroid Build Coastguard Worker movu m23, [r7 +ssq*0+ 0] 3810*c0909341SAndroid Build Coastguard Worker movu m25, [r7 +ssq*0+12] 3811*c0909341SAndroid Build Coastguard Worker pshufb m1, m18, m8 3812*c0909341SAndroid Build Coastguard Worker mova m0, m10 3813*c0909341SAndroid Build Coastguard Worker pshufb m2, m7, m9 3814*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m1 ; a0 3815*c0909341SAndroid Build Coastguard Worker mova m1, m10 3816*c0909341SAndroid Build Coastguard Worker pshufb m4, m6, m8 3817*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m2 ; a2' 3818*c0909341SAndroid Build Coastguard Worker mova m2, m10 3819*c0909341SAndroid Build Coastguard Worker pshufb m3, m19, m8 3820*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m4 ; b0 3821*c0909341SAndroid Build Coastguard Worker mova m4, m10 3822*c0909341SAndroid Build Coastguard Worker pshufb m5, m20, m9 3823*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m3 ; c0 3824*c0909341SAndroid Build Coastguard Worker mova m3, m10 3825*c0909341SAndroid Build Coastguard Worker pshufb m26, m21, m9 3826*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m5 ; b2' 3827*c0909341SAndroid Build Coastguard Worker mova m5, m10 3828*c0909341SAndroid Build Coastguard Worker pshufb m18, m9 3829*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m26 ; c2' 3830*c0909341SAndroid Build Coastguard Worker pshufb m7, m8 3831*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m18 ; a1 3832*c0909341SAndroid Build Coastguard Worker pshufb m6, m9 3833*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m7 ; a1' 3834*c0909341SAndroid Build Coastguard Worker pshufb m19, m9 3835*c0909341SAndroid Build Coastguard 
Worker vpdpwssd m2, m13, m6 ; b1 3836*c0909341SAndroid Build Coastguard Worker pshufb m20, m8 3837*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m19 ; c1 3838*c0909341SAndroid Build Coastguard Worker pshufb m21, m8 3839*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m20 ; b1' 3840*c0909341SAndroid Build Coastguard Worker shufpd m18, m7, 0x55 3841*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m21 ; c1' 3842*c0909341SAndroid Build Coastguard Worker shufpd m6, m20, 0x55 3843*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m18 ; a2 3844*c0909341SAndroid Build Coastguard Worker shufpd m19, m21, 0x55 3845*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m18 ; a0' 3846*c0909341SAndroid Build Coastguard Worker pshufb m18, m22, m8 3847*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m6 ; b2 3848*c0909341SAndroid Build Coastguard Worker pshufb m7, m23, m8 3849*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m19 ; c2 3850*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m6 ; b0' 3851*c0909341SAndroid Build Coastguard Worker mova m6, m10 3852*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m19 ; c0' 3853*c0909341SAndroid Build Coastguard Worker pshufb m19, m24, m9 3854*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m18 ; d0 3855*c0909341SAndroid Build Coastguard Worker mova m18, m10 3856*c0909341SAndroid Build Coastguard Worker pshufb m26, m25, m9 3857*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m12, m7 ; e0 3858*c0909341SAndroid Build Coastguard Worker mova m7, m10 3859*c0909341SAndroid Build Coastguard Worker pshufb m22, m9 3860*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m19 ; d2' 3861*c0909341SAndroid Build Coastguard Worker mova m19, m10 3862*c0909341SAndroid Build Coastguard Worker pshufb m23, m9 3863*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m14, m26 ; e2' 3864*c0909341SAndroid Build Coastguard Worker pshufb m24, m8 
3865*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m22 ; d1 3866*c0909341SAndroid Build Coastguard Worker pshufb m25, m8 3867*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m13, m23 ; e1 3868*c0909341SAndroid Build Coastguard Worker shufpd m22, m24, 0x55 3869*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m24 ; d1' 3870*c0909341SAndroid Build Coastguard Worker shufpd m23, m25, 0x55 3871*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m13, m25 ; e1' 3872*c0909341SAndroid Build Coastguard Worker pslldq m0, 1 3873*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m22 ; d2 3874*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 3875*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m14, m23 ; e2 3876*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m27, m4 ; 12 3877*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m22 ; d0' 3878*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m27, m5 ; 12' 3879*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m12, m23 ; e0' 3880*c0909341SAndroid Build Coastguard Worker vpshrdd m0, m2, 16 ; 01 3881*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m27, m18 ; 34 3882*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m3, 16 ; 01' 3883*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m27, m19 ; 34' 3884*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m2, m6, 16 ; 23 3885*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m3, m7, 16 ; 23' 3886*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 3887*c0909341SAndroid Build Coastguard Worker movu m22, [r7+ssq*1+ 0] 3888*c0909341SAndroid Build Coastguard Worker movu m24, [r7+ssq*1+12] 3889*c0909341SAndroid Build Coastguard Worker lea r7, [r7+ssq*2] 3890*c0909341SAndroid Build Coastguard Worker movu m23, [r7+ssq*0+ 0] 3891*c0909341SAndroid Build Coastguard Worker movu m25, [r7+ssq*0+12] 3892*c0909341SAndroid Build Coastguard Worker mova m19, m11 3893*c0909341SAndroid Build Coastguard Worker 
vpdpwssd m19, m15, m2 ; B0 3894*c0909341SAndroid Build Coastguard Worker mova m21, m11 3895*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m15, m3 ; B0' 3896*c0909341SAndroid Build Coastguard Worker mova m18, m11 3897*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m15, m0 ; A0 3898*c0909341SAndroid Build Coastguard Worker mova m20, m11 3899*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m15, m1 ; A0' 3900*c0909341SAndroid Build Coastguard Worker mova m2, m6 3901*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m16, m6 ; B1 3902*c0909341SAndroid Build Coastguard Worker mova m3, m7 3903*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m16, m7 ; B1' 3904*c0909341SAndroid Build Coastguard Worker mova m0, m4 3905*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m16, m4 ; A1 3906*c0909341SAndroid Build Coastguard Worker mova m1, m5 3907*c0909341SAndroid Build Coastguard Worker pshufb m4, m22, m8 3908*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m16, m5 ; A1' 3909*c0909341SAndroid Build Coastguard Worker mova m6, m10 3910*c0909341SAndroid Build Coastguard Worker pshufb m7, m23, m8 3911*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m4 ; f0 3912*c0909341SAndroid Build Coastguard Worker mova m4, m10 3913*c0909341SAndroid Build Coastguard Worker pshufb m5, m24, m9 3914*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m7 ; g0 3915*c0909341SAndroid Build Coastguard Worker mova m7, m10 3916*c0909341SAndroid Build Coastguard Worker pshufb m26, m25, m9 3917*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m5 ; f2' 3918*c0909341SAndroid Build Coastguard Worker mova m5, m10 3919*c0909341SAndroid Build Coastguard Worker pshufb m22, m9 3920*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m26 ; g2' 3921*c0909341SAndroid Build Coastguard Worker pshufb m23, m9 3922*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m22 ; f1 3923*c0909341SAndroid Build Coastguard Worker pshufb m24, m8 
3924*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m23 ; g1 3925*c0909341SAndroid Build Coastguard Worker pshufb m25, m8 3926*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m24 ; f1' 3927*c0909341SAndroid Build Coastguard Worker shufpd m22, m24, 0x55 3928*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m25 ; g1' 3929*c0909341SAndroid Build Coastguard Worker shufpd m23, m25, 0x55 3930*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m22 ; f2 3931*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m23 ; g2 3932*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m22 ; f0' 3933*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m23 ; g0' 3934*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m27, m4 ; 56 3935*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m27, m5 ; 56' 3936*c0909341SAndroid Build Coastguard Worker vpdpwssd m19, m17, m6 ; B2 3937*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m2, m6, 16 ; 45 3938*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m17, m7 ; B2' 3939*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m3, m7, 16 ; 45' 3940*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m17, m4 ; A2 3941*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m17, m5 ; A2' 3942*c0909341SAndroid Build Coastguard Worker vpermt2b m19, m28, m21 3943*c0909341SAndroid Build Coastguard Worker vpermt2b m18, m28, m20 3944*c0909341SAndroid Build Coastguard Worker mova [r8+wq*0], m18 3945*c0909341SAndroid Build Coastguard Worker mova [r8+wq*2], m19 3946*c0909341SAndroid Build Coastguard Worker lea r8, [r8+wq*4] 3947*c0909341SAndroid Build Coastguard Worker sub hd, 2 3948*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 3949*c0909341SAndroid Build Coastguard Worker add srcq, 64 3950*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3951*c0909341SAndroid Build Coastguard Worker movzx hd, r5b 3952*c0909341SAndroid Build Coastguard Worker sub r5d, 1<<8 
3953*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop0 3954*c0909341SAndroid Build Coastguard Worker%if WIN64 3955*c0909341SAndroid Build Coastguard Worker pop r8 3956*c0909341SAndroid Build Coastguard Worker%endif 3957*c0909341SAndroid Build Coastguard Worker RET 3958*c0909341SAndroid Build Coastguard Worker 3959*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc 3960*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc 3961*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc 3962*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc 3963*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp, SHARP, SHARP 3964*c0909341SAndroid Build Coastguard Worker 3965*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my 3966*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl 3967*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 3968*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 8tap_h, mx, 4tap_h 3969*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 3970*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 8tap_v, my, 4tap_v 3971*c0909341SAndroid Build Coastguard Worker lea r7, [prep_avx512icl] 3972*c0909341SAndroid Build Coastguard Worker mov wd, wm 3973*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 3974*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 3975*c0909341SAndroid Build Coastguard Worker jnz .h 3976*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3977*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep 3978*c0909341SAndroid Build Coastguard Worker.v: 3979*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3980*c0909341SAndroid Build Coastguard Worker shr myd, 16 
3981*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3982*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3983*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 3984*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [prep_8tap_rnd] 3985*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+myq*8] 3986*c0909341SAndroid Build Coastguard Worker tzcnt r6d, wd 3987*c0909341SAndroid Build Coastguard Worker shr r5d, 11 3988*c0909341SAndroid Build Coastguard Worker movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] 3989*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 3990*c0909341SAndroid Build Coastguard Worker add r7, r6 3991*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 3992*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3993*c0909341SAndroid Build Coastguard Worker mova [tmpq], xmm0 3994*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 3995*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+ 4] 3996*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 3997*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [tmpq+12] 3998*c0909341SAndroid Build Coastguard Worker jmp r7 3999*c0909341SAndroid Build Coastguard Worker.v_w4: 4000*c0909341SAndroid Build Coastguard Worker mov r3d, 0x330c 4001*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+strideq*0] 4002*c0909341SAndroid Build Coastguard Worker kmovw k1, r3d 4003*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym1{k1}, [srcq+strideq*1] 4004*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [srcq+r6 ] 4005*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 4006*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4007*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym0{k1}, [srcq+strideq*0] 4008*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [srcq+strideq*1] 
4009*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 4010*c0909341SAndroid Build Coastguard Worker mova ym5, [prep_endA] 4011*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 4012*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 4013*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; 01 12 23 34 4014*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m0 ; 23 34 45 56 4015*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 4016*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+r6 ] 4017*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4018*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym4{k1}, [srcq+strideq*0] 4019*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+strideq*1] 4020*c0909341SAndroid Build Coastguard Worker vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a 4021*c0909341SAndroid Build Coastguard Worker mova m3, m10 4022*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 4023*c0909341SAndroid Build Coastguard Worker valignq m1, m4, m0, 6 ; 6 7 8 9 4024*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 4025*c0909341SAndroid Build Coastguard Worker mova m0, m4 4026*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m1, m4 ; 67 78 89 9a 4027*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m15, m4 ; a3 b3 c3 d3 4028*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78 4029*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 4030*c0909341SAndroid Build Coastguard Worker mova m2, m4 4031*c0909341SAndroid Build Coastguard Worker vpermb m3, m5, m3 4032*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym3 4033*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4034*c0909341SAndroid Build Coastguard Worker sub hd, 4 4035*c0909341SAndroid Build Coastguard Worker jg 
.v_w4_loop 4036*c0909341SAndroid Build Coastguard Worker RET 4037*c0909341SAndroid Build Coastguard Worker.v_w8: 4038*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 4039*c0909341SAndroid Build Coastguard Worker mov r3d, 0x33 4040*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym1, [srcq+strideq*1] 4041*c0909341SAndroid Build Coastguard Worker kmovb k1, r3d 4042*c0909341SAndroid Build Coastguard Worker mova m7, [spel_v_shuf8] 4043*c0909341SAndroid Build Coastguard Worker vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 4044*c0909341SAndroid Build Coastguard Worker add srcq, r6 4045*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym2, [srcq+strideq*0] 4046*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [srcq+strideq*1] 4047*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [srcq+strideq*2] 4048*c0909341SAndroid Build Coastguard Worker vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4 4049*c0909341SAndroid Build Coastguard Worker vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6 4050*c0909341SAndroid Build Coastguard Worker mova m8, [prep_endB] 4051*c0909341SAndroid Build Coastguard Worker vpermb m1, m7, m1 ; 01 12 4052*c0909341SAndroid Build Coastguard Worker vpermb m2, m7, m2 ; 23 34 4053*c0909341SAndroid Build Coastguard Worker vpermb m3, m7, m0 ; 45 56 4054*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 4055*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4056*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [srcq+strideq*0] 4057*c0909341SAndroid Build Coastguard Worker movu xm5, [srcq+strideq*1] 4058*c0909341SAndroid Build Coastguard Worker vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8 4059*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [srcq+strideq*2] 4060*c0909341SAndroid Build Coastguard Worker vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a 4061*c0909341SAndroid Build Coastguard Worker mova m5, m10 4062*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, 
m12, m1 ; a0 b0 4063*c0909341SAndroid Build Coastguard Worker mova m6, m10 4064*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m2 ; c0 d0 4065*c0909341SAndroid Build Coastguard Worker mova m1, m3 4066*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m2 ; a1 b1 4067*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m3 ; c1 d1 4068*c0909341SAndroid Build Coastguard Worker vpermb m2, m7, m4 ; 67 78 4069*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m3 ; a2 b2 4070*c0909341SAndroid Build Coastguard Worker vpermb m3, m7, m0 ; 89 9a 4071*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m2 ; c2 d2 4072*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m15, m2 ; a3 b3 4073*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m3 ; c3 d3 4074*c0909341SAndroid Build Coastguard Worker vpermt2b m5, m8, m6 4075*c0909341SAndroid Build Coastguard Worker mova [tmpq], m5 4076*c0909341SAndroid Build Coastguard Worker add tmpq, 64 4077*c0909341SAndroid Build Coastguard Worker sub hd, 4 4078*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 4079*c0909341SAndroid Build Coastguard Worker RET 4080*c0909341SAndroid Build Coastguard Worker.v_w16: 4081*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m0, [srcq+strideq*1] 4082*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, [srcq+strideq*2], 1 4083*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+strideq*0], 0 4084*c0909341SAndroid Build Coastguard Worker mova m8, [spel_v_shuf16] 4085*c0909341SAndroid Build Coastguard Worker add srcq, r6 4086*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+strideq*0] 4087*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+strideq*1], 1 4088*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+strideq*2] 4089*c0909341SAndroid Build Coastguard Worker add srcq, r6 4090*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+strideq*0], 1 4091*c0909341SAndroid Build
Coastguard Worker mova m11, [prep_endA] 4092*c0909341SAndroid Build Coastguard Worker vpermb m1, m8, m1 ; 12 4093*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 ; 01 4094*c0909341SAndroid Build Coastguard Worker vpermb m3, m8, m3 ; 34 4095*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m5 ; 56 4096*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 23 4097*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m3, m5, 16 ; 45 4098*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 4099*c0909341SAndroid Build Coastguard Worker mova m7, m10 4100*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m1 ; b0 4101*c0909341SAndroid Build Coastguard Worker mova m6, m10 4102*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; a0 4103*c0909341SAndroid Build Coastguard Worker mova m1, m3 4104*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m3 ; b1 4105*c0909341SAndroid Build Coastguard Worker mova m0, m2 4106*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m2 ; a1 4107*c0909341SAndroid Build Coastguard Worker mova m3, m5 4108*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m5 ; b2 4109*c0909341SAndroid Build Coastguard Worker mova m2, m4 4110*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m4 ; a2 4111*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+strideq*1] 4112*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 4113*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+strideq*0], 1 4114*c0909341SAndroid Build Coastguard Worker vpermb m5, m8, m5 ; 78 4115*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m3, m5, 16 ; 67 4116*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m15, m5 ; b3 4117*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m4 ; a3 4118*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m11, m7 4119*c0909341SAndroid Build Coastguard Worker mova [tmpq], m6 4120*c0909341SAndroid Build Coastguard Worker add 
tmpq, 64 4121*c0909341SAndroid Build Coastguard Worker sub hd, 2 4122*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 4123*c0909341SAndroid Build Coastguard Worker RET 4124*c0909341SAndroid Build Coastguard Worker.v_w32: 4125*c0909341SAndroid Build Coastguard Worker.v_w64: 4126*c0909341SAndroid Build Coastguard Worker.v_w128: 4127*c0909341SAndroid Build Coastguard Worker WIN64_PUSH_XMM 23 4128*c0909341SAndroid Build Coastguard Worker%if WIN64 4129*c0909341SAndroid Build Coastguard Worker push r8 4130*c0909341SAndroid Build Coastguard Worker%endif 4131*c0909341SAndroid Build Coastguard Worker mova m11, [prep_endC] 4132*c0909341SAndroid Build Coastguard Worker lea r5, [hq+wq*8-256] 4133*c0909341SAndroid Build Coastguard Worker.v_w32_loop0: 4134*c0909341SAndroid Build Coastguard Worker movu m16, [srcq+strideq*0] 4135*c0909341SAndroid Build Coastguard Worker movu m17, [srcq+strideq*1] 4136*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+r6] 4137*c0909341SAndroid Build Coastguard Worker movu m18, [srcq+strideq*2] 4138*c0909341SAndroid Build Coastguard Worker movu m19, [r7 +strideq*0] 4139*c0909341SAndroid Build Coastguard Worker mov r8, tmpq 4140*c0909341SAndroid Build Coastguard Worker movu m20, [r7 +strideq*1] 4141*c0909341SAndroid Build Coastguard Worker movu m21, [r7 +strideq*2] 4142*c0909341SAndroid Build Coastguard Worker add r7, r6 4143*c0909341SAndroid Build Coastguard Worker movu m22, [r7 +strideq*0] 4144*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m16, m17 ; 01l 4145*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 ; 01h 4146*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m17, m18 ; 12l 4147*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m18 ; 12h 4148*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m18, m19 ; 23l 4149*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19 ; 23h 4150*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m19, m20 ; 34l 4151*c0909341SAndroid Build Coastguard 
Worker punpckhwd m19, m20 ; 34h 4152*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m20, m21 ; 45l 4153*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m21 ; 45h 4154*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m21, m22 ; 56l 4155*c0909341SAndroid Build Coastguard Worker punpckhwd m21, m22 ; 56h 4156*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 4157*c0909341SAndroid Build Coastguard Worker mova m6, m10 4158*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; a0l 4159*c0909341SAndroid Build Coastguard Worker mova m8, m10 4160*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m12, m16 ; a0h 4161*c0909341SAndroid Build Coastguard Worker mova m7, m10 4162*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m1 ; b0l 4163*c0909341SAndroid Build Coastguard Worker mova m9, m10 4164*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m12, m17 ; b0h 4165*c0909341SAndroid Build Coastguard Worker mova m0, m2 4166*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m2 ; a1l 4167*c0909341SAndroid Build Coastguard Worker mova m16, m18 4168*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m13, m18 ; a1h 4169*c0909341SAndroid Build Coastguard Worker mova m1, m3 4170*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m3 ; b1l 4171*c0909341SAndroid Build Coastguard Worker mova m17, m19 4172*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m13, m19 ; b1h 4173*c0909341SAndroid Build Coastguard Worker mova m2, m4 4174*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m4 ; a2l 4175*c0909341SAndroid Build Coastguard Worker mova m18, m20 4176*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m14, m20 ; a2h 4177*c0909341SAndroid Build Coastguard Worker mova m3, m5 4178*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m5 ; b2l 4179*c0909341SAndroid Build Coastguard Worker mova m19, m21 4180*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m14, m21 ; b2h 4181*c0909341SAndroid Build 
Coastguard Worker movu m21, [r7+strideq*1] 4182*c0909341SAndroid Build Coastguard Worker lea r7, [r7+strideq*2] 4183*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m22, m21 ; 67l 4184*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m22, m21 ; 67h 4185*c0909341SAndroid Build Coastguard Worker movu m22, [r7+strideq*0] 4186*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m4 ; a3l 4187*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m15, m20 ; a3h 4188*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m21, m22 ; 78l 4189*c0909341SAndroid Build Coastguard Worker punpckhwd m21, m22 ; 78h 4190*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m15, m5 ; b3l 4191*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m15, m21 ; b3h 4192*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m11, m8 4193*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m11, m9 4194*c0909341SAndroid Build Coastguard Worker mova [r8+wq*0], m6 4195*c0909341SAndroid Build Coastguard Worker mova [r8+wq*2], m7 4196*c0909341SAndroid Build Coastguard Worker lea r8, [r8+wq*4] 4197*c0909341SAndroid Build Coastguard Worker sub hd, 2 4198*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 4199*c0909341SAndroid Build Coastguard Worker add srcq, 64 4200*c0909341SAndroid Build Coastguard Worker add tmpq, 64 4201*c0909341SAndroid Build Coastguard Worker movzx hd, r5b 4202*c0909341SAndroid Build Coastguard Worker sub r5d, 1<<8 4203*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop0 4204*c0909341SAndroid Build Coastguard Worker%if WIN64 4205*c0909341SAndroid Build Coastguard Worker pop r8 4206*c0909341SAndroid Build Coastguard Worker%endif 4207*c0909341SAndroid Build Coastguard Worker RET 4208*c0909341SAndroid Build Coastguard Worker.h_w4: 4209*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 4210*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 4211*c0909341SAndroid Build Coastguard Worker sub srcq, 2 4212*c0909341SAndroid Build 
Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4213*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 4214*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [spel_h_shufA] 4215*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [spel_h_shufB] 4216*c0909341SAndroid Build Coastguard Worker shr r5d, 11 4217*c0909341SAndroid Build Coastguard Worker mova ym9, [prep_endA] 4218*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 4219*c0909341SAndroid Build Coastguard Worker mova [tmpq], xmm0 4220*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [tmpq+4] 4221*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [tmpq+8] 4222*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 4223*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*0] 4224*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+strideq*1], 1 4225*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+strideq*2], 2 4226*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+r6 ], 3 4227*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4228*c0909341SAndroid Build Coastguard Worker mova m0, m10 4229*c0909341SAndroid Build Coastguard Worker pshufb m1, m2, m4 4230*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m6, m1 4231*c0909341SAndroid Build Coastguard Worker pshufb m2, m5 4232*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m7, m2 4233*c0909341SAndroid Build Coastguard Worker vpermb m0, m9, m0 4234*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym0 4235*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4236*c0909341SAndroid Build Coastguard Worker sub hd, 4 4237*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 4238*c0909341SAndroid Build Coastguard Worker RET 4239*c0909341SAndroid Build Coastguard Worker.h_w8: 4240*c0909341SAndroid Build Coastguard Worker mova m6, [spel_h_shufA] 4241*c0909341SAndroid Build Coastguard Worker movu m7, 
[spel_h_shufB] 4242*c0909341SAndroid Build Coastguard Worker movu m8, [spel_h_shufC] 4243*c0909341SAndroid Build Coastguard Worker mova m9, [spel_h_shufD] 4244*c0909341SAndroid Build Coastguard Worker mova m11, [prep_endB] 4245*c0909341SAndroid Build Coastguard Worker.h_w8_loop: 4246*c0909341SAndroid Build Coastguard Worker movu ym4, [srcq+strideq*0] 4247*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, [srcq+strideq*1], 1 4248*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+strideq*2] 4249*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+r6 ], 1 4250*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4251*c0909341SAndroid Build Coastguard Worker mova m0, m10 4252*c0909341SAndroid Build Coastguard Worker mova m1, m10 4253*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m4 4254*c0909341SAndroid Build Coastguard Worker vpermb m3, m6, m5 4255*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m2 4256*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m3 4257*c0909341SAndroid Build Coastguard Worker vpermb m2, m7, m4 4258*c0909341SAndroid Build Coastguard Worker vpermb m3, m7, m5 4259*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 4260*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m3 4261*c0909341SAndroid Build Coastguard Worker vpermb m2, m8, m4 4262*c0909341SAndroid Build Coastguard Worker vpermb m3, m8, m5 4263*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m2 4264*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m3 4265*c0909341SAndroid Build Coastguard Worker vpermb m2, m9, m4 4266*c0909341SAndroid Build Coastguard Worker vpermb m3, m9, m5 4267*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m15, m2 4268*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m15, m3 4269*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m11, m1 4270*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 4271*c0909341SAndroid Build Coastguard Worker 
add tmpq, 64 4272*c0909341SAndroid Build Coastguard Worker sub hd, 4 4273*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 4274*c0909341SAndroid Build Coastguard Worker RET 4275*c0909341SAndroid Build Coastguard Worker.h: 4276*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [prep_8tap_rnd] 4277*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 4278*c0909341SAndroid Build Coastguard Worker jnz .hv 4279*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 4280*c0909341SAndroid Build Coastguard Worker cmp wd, 4 4281*c0909341SAndroid Build Coastguard Worker je .h_w4 4282*c0909341SAndroid Build Coastguard Worker shr mxd, 16 4283*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4284*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 4285*c0909341SAndroid Build Coastguard Worker sub srcq, 6 4286*c0909341SAndroid Build Coastguard Worker shr r5d, 11 4287*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 4288*c0909341SAndroid Build Coastguard Worker mova [tmpq], xmm0 4289*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 4290*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+ 4] 4291*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 4292*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [tmpq+12] 4293*c0909341SAndroid Build Coastguard Worker cmp wd, 16 4294*c0909341SAndroid Build Coastguard Worker jl .h_w8 4295*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufA] 4296*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [spel_h_shufB] 4297*c0909341SAndroid Build Coastguard Worker mova m11, [prep_endC] 4298*c0909341SAndroid Build Coastguard Worker jg .h_w32 4299*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 4300*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+strideq*0+ 0] 4301*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+strideq*1+ 0], 1 
4302*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+strideq*0+16] 4303*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [srcq+strideq*1+16], 1 4304*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 4305*c0909341SAndroid Build Coastguard Worker mova m0, m10 4306*c0909341SAndroid Build Coastguard Worker mova m1, m10 4307*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 4308*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 ; a0 4309*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 4310*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m4 ; b2 4311*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m7 4312*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m4 ; a1 4313*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m7 4314*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m15, m4 ; b3 4315*c0909341SAndroid Build Coastguard Worker shufpd m2, m3, 0x55 4316*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 4317*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m4 ; a2 4318*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; b0 4319*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 4320*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m15, m2 ; a3 4321*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m2 ; b1 4322*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m11, m1 4323*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 4324*c0909341SAndroid Build Coastguard Worker add tmpq, 64 4325*c0909341SAndroid Build Coastguard Worker sub hd, 2 4326*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 4327*c0909341SAndroid Build Coastguard Worker RET 4328*c0909341SAndroid Build Coastguard Worker.h_w32: 4329*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 4330*c0909341SAndroid Build Coastguard Worker neg wq 4331*c0909341SAndroid Build Coastguard Worker.h_w32_loop0: 4332*c0909341SAndroid Build 
Coastguard Worker mov r6, wq 4333*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 4334*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+r6*2+ 0] 4335*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+r6*2+ 8] 4336*c0909341SAndroid Build Coastguard Worker mova m0, m10 4337*c0909341SAndroid Build Coastguard Worker mova m1, m10 4338*c0909341SAndroid Build Coastguard Worker pshufb m4, m2, m6 4339*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m12, m4 ; a0 4340*c0909341SAndroid Build Coastguard Worker pshufb m4, m3, m6 4341*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m4 ; b0 4342*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m14, m4 ; a2 4343*c0909341SAndroid Build Coastguard Worker movu m4, [srcq+r6*2+16] 4344*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 4345*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m3 ; b1 4346*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m15, m3 ; a3 4347*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m6 4348*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m3 ; b2 4349*c0909341SAndroid Build Coastguard Worker pshufb m2, m7 4350*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m13, m2 ; a1 4351*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 4352*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m15, m4 ; b3 4353*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m11, m1 4354*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 4355*c0909341SAndroid Build Coastguard Worker add tmpq, 64 4356*c0909341SAndroid Build Coastguard Worker add r6, 32 4357*c0909341SAndroid Build Coastguard Worker jl .h_w32_loop 4358*c0909341SAndroid Build Coastguard Worker add srcq, strideq 4359*c0909341SAndroid Build Coastguard Worker dec hd 4360*c0909341SAndroid Build Coastguard Worker jg .h_w32_loop0 4361*c0909341SAndroid Build Coastguard Worker RET 4362*c0909341SAndroid Build Coastguard Worker.hv: 4363*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m11, [pd_128] 4364*c0909341SAndroid Build Coastguard Worker cmp wd, 4 4365*c0909341SAndroid Build Coastguard Worker jg .hv_w8 4366*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 4367*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4368*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 4369*c0909341SAndroid Build Coastguard Worker shr myd, 16 4370*c0909341SAndroid Build Coastguard Worker cmp hd, 4 4371*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 4372*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 4373*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+myq*8] 4374*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 4375*c0909341SAndroid Build Coastguard Worker sub srcq, 2 4376*c0909341SAndroid Build Coastguard Worker shr r5d, 11 4377*c0909341SAndroid Build Coastguard Worker sub srcq, r6 4378*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 4379*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 4380*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], xmm0 4381*c0909341SAndroid Build Coastguard Worker mova [tmpq+16], xmm1 4382*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm1 4383*c0909341SAndroid Build Coastguard Worker movu xm16, [srcq+strideq*0] 4384*c0909341SAndroid Build Coastguard Worker mov r3d, 0xff0 4385*c0909341SAndroid Build Coastguard Worker vinserti128 ym16, [srcq+strideq*1], 1 4386*c0909341SAndroid Build Coastguard Worker kmovw k1, r3d 4387*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m18, [srcq+strideq*2] 4388*c0909341SAndroid Build Coastguard Worker add srcq, r6 4389*c0909341SAndroid Build Coastguard Worker vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 4390*c0909341SAndroid Build Coastguard Worker movu xm17, [srcq+strideq*1] 4391*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym18, [srcq+strideq*2] 4392*c0909341SAndroid Build Coastguard Worker add srcq, r6 
4393*c0909341SAndroid Build Coastguard Worker vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 4394*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [spel_h_shufA] 4395*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [spel_h_shufB] 4396*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [tmpq+ 4] 4397*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [tmpq+ 8] 4398*c0909341SAndroid Build Coastguard Worker mova m1, m10 4399*c0909341SAndroid Build Coastguard Worker mova m19, [spel_shuf4a] 4400*c0909341SAndroid Build Coastguard Worker mova m2, m10 4401*c0909341SAndroid Build Coastguard Worker pshufb m0, m16, m5 4402*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m8, m0 4403*c0909341SAndroid Build Coastguard Worker pshufb m0, m17, m5 4404*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m8, m0 4405*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+20] 4406*c0909341SAndroid Build Coastguard Worker pshufb m16, m6 4407*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+24] 4408*c0909341SAndroid Build Coastguard Worker pshufb m17, m6 4409*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [tmpq+28] 4410*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m9, m16 ; 0 1 2 3 4411*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m9, m17 ; 4 5 6 4412*c0909341SAndroid Build Coastguard Worker mova m7, [spel_shuf4b] 4413*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m19, m2 ; 01 12 23 34 4414*c0909341SAndroid Build Coastguard Worker vpermb m2, m19, m2 ; 45 56 4415*c0909341SAndroid Build Coastguard Worker mova ym19, [prep_endA] 4416*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 4417*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 4418*c0909341SAndroid Build Coastguard Worker movu xm17, [srcq+strideq*1] 4419*c0909341SAndroid Build Coastguard Worker vinserti128 ym17, [srcq+strideq*2], 1 4420*c0909341SAndroid Build Coastguard Worker 
vbroadcasti32x4 m16, [srcq+r6 ] 4421*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4422*c0909341SAndroid Build Coastguard Worker vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 4423*c0909341SAndroid Build Coastguard Worker mova m18, m10 4424*c0909341SAndroid Build Coastguard Worker pshufb m16, m17, m5 4425*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m8, m16 4426*c0909341SAndroid Build Coastguard Worker mova m16, m11 4427*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 4428*c0909341SAndroid Build Coastguard Worker pshufb m17, m6 4429*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m9, m17 ; 7 8 9 a 4430*c0909341SAndroid Build Coastguard Worker mova m1, m2 4431*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 4432*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m7, m18 ; 67 78 89 9a 4433*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m15, m2 ; a3 b3 c3 d3 4434*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m2, q1032 ; 45 56 67 78 4435*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 4436*c0909341SAndroid Build Coastguard Worker vpermb m16, m19, m16 4437*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym16 4438*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4439*c0909341SAndroid Build Coastguard Worker sub hd, 4 4440*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 4441*c0909341SAndroid Build Coastguard Worker vzeroupper 4442*c0909341SAndroid Build Coastguard Worker RET 4443*c0909341SAndroid Build Coastguard Worker.hv_w8: 4444*c0909341SAndroid Build Coastguard Worker shr mxd, 16 4445*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm0, [base+subpel_filters+mxq*8] 4446*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 4447*c0909341SAndroid Build Coastguard Worker shr myd, 16 4448*c0909341SAndroid Build Coastguard Worker cmp hd, 6 4449*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 
4450*c0909341SAndroid Build Coastguard Worker mov r5d, r7m 4451*c0909341SAndroid Build Coastguard Worker pmovsxbw xmm1, [base+subpel_filters+myq*8] 4452*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 4453*c0909341SAndroid Build Coastguard Worker sub srcq, 6 4454*c0909341SAndroid Build Coastguard Worker shr r5d, 11 4455*c0909341SAndroid Build Coastguard Worker sub srcq, r6 4456*c0909341SAndroid Build Coastguard Worker psllw xmm0, [base+prep_hv_shift+r5*8] 4457*c0909341SAndroid Build Coastguard Worker psllw xmm1, 2 4458*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], xmm0 4459*c0909341SAndroid Build Coastguard Worker mova [tmpq+16], xmm1 4460*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, xmm0 4461*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [tmpq+ 4] 4462*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [tmpq+ 8] 4463*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [tmpq+12] 4464*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, xmm1 4465*c0909341SAndroid Build Coastguard Worker vpbroadcastd m17, [tmpq+20] 4466*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [tmpq+24] 4467*c0909341SAndroid Build Coastguard Worker vpbroadcastd m19, [tmpq+28] 4468*c0909341SAndroid Build Coastguard Worker cmp wd, 8 4469*c0909341SAndroid Build Coastguard Worker jg .hv_w16 4470*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 23 4471*c0909341SAndroid Build Coastguard Worker mova m5, [spel_h_shufA] 4472*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 4473*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 4474*c0909341SAndroid Build Coastguard Worker movu ym9, [srcq+strideq*2] 4475*c0909341SAndroid Build Coastguard Worker add srcq, r6 4476*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 4477*c0909341SAndroid Build Coastguard Worker movu ym20, [srcq+strideq*1] 4478*c0909341SAndroid Build Coastguard Worker 
vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 4479*c0909341SAndroid Build Coastguard Worker add srcq, r6 4480*c0909341SAndroid Build Coastguard Worker movu ym21, [srcq+strideq*0] ; 6 4481*c0909341SAndroid Build Coastguard Worker movu m6, [spel_h_shufB] 4482*c0909341SAndroid Build Coastguard Worker movu m7, [spel_h_shufC] 4483*c0909341SAndroid Build Coastguard Worker mova ym22, [prep_endB] 4484*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m0 4485*c0909341SAndroid Build Coastguard Worker mova m1, m10 4486*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m8 ; a0 b0 4487*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m9 4488*c0909341SAndroid Build Coastguard Worker mova m2, m10 4489*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m8 ; c0 d0 4490*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m20 4491*c0909341SAndroid Build Coastguard Worker mova m3, m10 4492*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m8 ; e0 f0 4493*c0909341SAndroid Build Coastguard Worker vpermb m8, m5, m21 4494*c0909341SAndroid Build Coastguard Worker mova m4, m10 4495*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m8 ; g0 4496*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m0 4497*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m13, m8 ; a1 b1 4498*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m9 4499*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m8 ; c1 d1 4500*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m20 4501*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m8 ; e1 f1 4502*c0909341SAndroid Build Coastguard Worker vpermb m8, m6, m21 4503*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m8 ; g1 4504*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m0 4505*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m8 ; a2 b2 4506*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m9 4507*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, 
m14, m8 ; c2 d2 4508*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m20 4509*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m8 ; e2 f2 4510*c0909341SAndroid Build Coastguard Worker vpermb m8, m7, m21 4511*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m8 ; g2 4512*c0909341SAndroid Build Coastguard Worker mova m8, [spel_h_shufD] 4513*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 4514*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m15, m0 ; a3 b3 4515*c0909341SAndroid Build Coastguard Worker mova m0, [spel_shuf8a] 4516*c0909341SAndroid Build Coastguard Worker vpermb m9, m8, m9 4517*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m15, m9 ; c3 d3 4518*c0909341SAndroid Build Coastguard Worker mova m9, [spel_shuf8b] 4519*c0909341SAndroid Build Coastguard Worker vpermb m20, m8, m20 4520*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m15, m20 ; e3 f3 4521*c0909341SAndroid Build Coastguard Worker vpermb m21, m8, m21 4522*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m15, m21 ; g3 4523*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m0, m2 ; 01 12 4524*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m0, m3 ; 23 34 4525*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m0, m4 ; 45 56 4526*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 4527*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*1] 4528*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 4529*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+strideq*0], 1 4530*c0909341SAndroid Build Coastguard Worker mova m4, m10 4531*c0909341SAndroid Build Coastguard Worker mova m20, m11 4532*c0909341SAndroid Build Coastguard Worker vpermb m21, m5, m0 4533*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m21 ; h0 i0 4534*c0909341SAndroid Build Coastguard Worker vpermb m21, m6, m0 4535*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m16, m1 ; A0 B0 4536*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m4, m13, m21 ; h1 i1 4537*c0909341SAndroid Build Coastguard Worker vpermb m21, m7, m0 4538*c0909341SAndroid Build Coastguard Worker mova m1, m2 4539*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m17, m2 ; A1 B1 4540*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m21 ; h2 i2 4541*c0909341SAndroid Build Coastguard Worker vpermb m21, m8, m0 4542*c0909341SAndroid Build Coastguard Worker mova m2, m3 4543*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m18, m3 ; A2 B2 4544*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m15, m21 ; h3 i3 4545*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m9, m4 ; 67 78 4546*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m19, m3 ; A3 B3 4547*c0909341SAndroid Build Coastguard Worker vpermb m20, m22, m20 4548*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym20 4549*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4550*c0909341SAndroid Build Coastguard Worker sub hd, 2 4551*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 4552*c0909341SAndroid Build Coastguard Worker RET 4553*c0909341SAndroid Build Coastguard Worker.hv_w16: 4554*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 27 4555*c0909341SAndroid Build Coastguard Worker%if WIN64 4556*c0909341SAndroid Build Coastguard Worker push r8 4557*c0909341SAndroid Build Coastguard Worker%endif 4558*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [spel_h_shufA] 4559*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m21, [spel_h_shufB] 4560*c0909341SAndroid Build Coastguard Worker add wd, wd 4561*c0909341SAndroid Build Coastguard Worker mova m9, [spel_shuf16] 4562*c0909341SAndroid Build Coastguard Worker mova m26, [prep_endB] 4563*c0909341SAndroid Build Coastguard Worker lea r5d, [hq+wq*8-256] 4564*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 4565*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m5, [srcq+strideq*0+ 8] 4566*c0909341SAndroid Build Coastguard 
Worker vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 4567*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 4568*c0909341SAndroid Build Coastguard Worker movu ym6, [srcq+strideq*1+ 0] 4569*c0909341SAndroid Build Coastguard Worker movu ym7, [srcq+strideq*1+16] 4570*c0909341SAndroid Build Coastguard Worker lea r7, [srcq+r6] 4571*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [srcq+strideq*2+ 0], 1 4572*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 4573*c0909341SAndroid Build Coastguard Worker movu ym22, [r7 +strideq*0+ 0] 4574*c0909341SAndroid Build Coastguard Worker movu ym23, [r7 +strideq*0+16] 4575*c0909341SAndroid Build Coastguard Worker mov r8, tmpq 4576*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, [r7 +strideq*1+ 0], 1 4577*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 4578*c0909341SAndroid Build Coastguard Worker movu ym24, [r7 +strideq*2+ 0] 4579*c0909341SAndroid Build Coastguard Worker movu ym25, [r7 +strideq*2+16] 4580*c0909341SAndroid Build Coastguard Worker add r7, r6 4581*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, [r7 +strideq*0+ 0], 1 4582*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 4583*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m20 4584*c0909341SAndroid Build Coastguard Worker mova m1, m10 4585*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m12, m0 ; a0 4586*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m20 4587*c0909341SAndroid Build Coastguard Worker mova m2, m10 4588*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m12, m0 ; b0 4589*c0909341SAndroid Build Coastguard Worker pshufb m0, m7, m20 4590*c0909341SAndroid Build Coastguard Worker mova m3, m10 4591*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m14, m0 ; c2 4592*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m21 4593*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m1, m13, m0 ; a1 4594*c0909341SAndroid Build Coastguard Worker pshufb m0, m6, m21 4595*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m13, m0 ; b1 4596*c0909341SAndroid Build Coastguard Worker pshufb m0, m7, m21 4597*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m15, m0 ; c3 4598*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m20 4599*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m14, m0 ; a2 4600*c0909341SAndroid Build Coastguard Worker shufpd m6, m7, 0x55 4601*c0909341SAndroid Build Coastguard Worker pshufb m7, m6, m20 4602*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m14, m7 ; b2 4603*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m12, m7 ; c0 4604*c0909341SAndroid Build Coastguard Worker pshufb m5, m21 4605*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m15, m5 ; a3 4606*c0909341SAndroid Build Coastguard Worker pshufb m6, m21 4607*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m15, m6 ; b3 4608*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m13, m6 ; c1 4609*c0909341SAndroid Build Coastguard Worker pshufb m0, m22, m20 4610*c0909341SAndroid Build Coastguard Worker mova m4, m10 4611*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m12, m0 ; d0 4612*c0909341SAndroid Build Coastguard Worker pshufb m0, m23, m20 4613*c0909341SAndroid Build Coastguard Worker mova m5, m10 4614*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m14, m0 ; e2 4615*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m20 4616*c0909341SAndroid Build Coastguard Worker mova m6, m10 4617*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m12, m0 ; f0 4618*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m20 4619*c0909341SAndroid Build Coastguard Worker mova m7, m10 4620*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m0 ; g2 4621*c0909341SAndroid Build Coastguard Worker pshufb m0, m22, m21 4622*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m13, m0 ; d1 
4623*c0909341SAndroid Build Coastguard Worker pshufb m0, m23, m21 4624*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m15, m0 ; e3 4625*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m21 4626*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m13, m0 ; f1 4627*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m21 4628*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m15, m0 ; g3 4629*c0909341SAndroid Build Coastguard Worker shufpd m22, m23, 0x55 4630*c0909341SAndroid Build Coastguard Worker pshufb m23, m22, m20 4631*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m14, m23 ; d2 4632*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m12, m23 ; e0 4633*c0909341SAndroid Build Coastguard Worker shufpd m24, m25, 0x55 4634*c0909341SAndroid Build Coastguard Worker pshufb m25, m24, m20 4635*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m14, m25 ; f2 4636*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m25 ; g0 4637*c0909341SAndroid Build Coastguard Worker pshufb m22, m21 4638*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m15, m22 ; d3 4639*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m13, m22 ; e1 4640*c0909341SAndroid Build Coastguard Worker pshufb m24, m21 4641*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m15, m24 ; f3 4642*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m24 ; g1 4643*c0909341SAndroid Build Coastguard Worker pslldq m1, 1 4644*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m9, m3 ; 12 4645*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m9, m5 ; 34 4646*c0909341SAndroid Build Coastguard Worker vpermt2b m6, m9, m7 ; 56 4647*c0909341SAndroid Build Coastguard Worker vpshrdd m1, m2, 16 ; 01 4648*c0909341SAndroid Build Coastguard Worker vpshrdd m3, m2, m4, 16 ; 23 4649*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m4, m6, 16 ; 45 4650*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 4651*c0909341SAndroid Build Coastguard Worker movu ym24, 
[r7+strideq*1+ 0] 4652*c0909341SAndroid Build Coastguard Worker movu ym25, [r7+strideq*1+16] 4653*c0909341SAndroid Build Coastguard Worker lea r7, [r7+strideq*2] 4654*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, [r7+strideq*0+ 0], 1 4655*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, [r7+strideq*0+16], 1 4656*c0909341SAndroid Build Coastguard Worker mova m7, m10 4657*c0909341SAndroid Build Coastguard Worker mova m8, m10 4658*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m20 4659*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m12, m0 ; h0 4660*c0909341SAndroid Build Coastguard Worker mova m22, m11 4661*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m20 4662*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m14, m0 ; i2 4663*c0909341SAndroid Build Coastguard Worker mova m23, m11 4664*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m16, m1 ; A0 4665*c0909341SAndroid Build Coastguard Worker mova m1, m3 4666*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m16, m2 ; B0 4667*c0909341SAndroid Build Coastguard Worker mova m2, m4 4668*c0909341SAndroid Build Coastguard Worker pshufb m0, m24, m21 4669*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m13, m0 ; h1 4670*c0909341SAndroid Build Coastguard Worker pshufb m0, m25, m21 4671*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m15, m0 ; i3 4672*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m17, m3 ; A1 4673*c0909341SAndroid Build Coastguard Worker mova m3, m5 4674*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m17, m4 ; B1 4675*c0909341SAndroid Build Coastguard Worker mova m4, m6 4676*c0909341SAndroid Build Coastguard Worker shufpd m24, m25, 0x55 4677*c0909341SAndroid Build Coastguard Worker pshufb m25, m24, m20 4678*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m14, m25 ; h2 4679*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m12, m25 ; i0 4680*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m18, m5 ; A2 
4681*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m18, m6 ; B2 4682*c0909341SAndroid Build Coastguard Worker pshufb m24, m21 4683*c0909341SAndroid Build Coastguard Worker vpdpwssd m7, m15, m24 ; h3 4684*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m13, m24 ; i1 4685*c0909341SAndroid Build Coastguard Worker vpermt2b m7, m9, m8 ; 78 4686*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m6, m7, 16 ; 67 4687*c0909341SAndroid Build Coastguard Worker vpdpwssd m22, m19, m5 ; A3 4688*c0909341SAndroid Build Coastguard Worker vpdpwssd m23, m19, m7 ; B3 4689*c0909341SAndroid Build Coastguard Worker mova m6, m7 4690*c0909341SAndroid Build Coastguard Worker vpermt2b m22, m26, m23 4691*c0909341SAndroid Build Coastguard Worker mova [r8+wq*0], ym22 4692*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r8+wq*1], m22, 1 4693*c0909341SAndroid Build Coastguard Worker lea r8, [r8+wq*2] 4694*c0909341SAndroid Build Coastguard Worker sub hd, 2 4695*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 4696*c0909341SAndroid Build Coastguard Worker add srcq, 32 4697*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4698*c0909341SAndroid Build Coastguard Worker movzx hd, r5b 4699*c0909341SAndroid Build Coastguard Worker sub r5d, 1<<8 4700*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop0 4701*c0909341SAndroid Build Coastguard Worker%if WIN64 4702*c0909341SAndroid Build Coastguard Worker pop r8 4703*c0909341SAndroid Build Coastguard Worker%endif 4704*c0909341SAndroid Build Coastguard Worker RET 4705*c0909341SAndroid Build Coastguard Worker 4706*c0909341SAndroid Build Coastguard Worker%if WIN64 4707*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5 4708*c0909341SAndroid Build Coastguard Worker%else 4709*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7 4710*c0909341SAndroid Build Coastguard Worker%endif 4711*c0909341SAndroid Build Coastguard Worker 4712*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_16bpc, 4, 7, 22, 
tmp, ts 4713*c0909341SAndroid Build Coastguard Worker%define base r6-pd_0to7 4714*c0909341SAndroid Build Coastguard Worker mov t0d, r7m 4715*c0909341SAndroid Build Coastguard Worker lea r6, [pd_0to7] 4716*c0909341SAndroid Build Coastguard Worker shr t0d, 11 4717*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+warp_8x8t_rnd_v] 4718*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4719*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main 4720*c0909341SAndroid Build Coastguard Worker psrad m14, m16, 15 4721*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4722*c0909341SAndroid Build Coastguard Worker psrad m16, 15 4723*c0909341SAndroid Build Coastguard Worker packssdw m14, m16 4724*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4725*c0909341SAndroid Build Coastguard Worker psrad m15, m16, 15 4726*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 4727*c0909341SAndroid Build Coastguard Worker add tsq, tsq 4728*c0909341SAndroid Build Coastguard Worker psrad m16, 15 4729*c0909341SAndroid Build Coastguard Worker packssdw m15, m16 4730*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end 4731*c0909341SAndroid Build Coastguard Worker 4732*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd 4733*c0909341SAndroid Build Coastguard Worker mov t0d, r7m ; pixel_max 4734*c0909341SAndroid Build Coastguard Worker lea r6, [pd_0to7] 4735*c0909341SAndroid Build Coastguard Worker shr t0d, 11 4736*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] 4737*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] 4738*c0909341SAndroid 
Build Coastguard Worker call .main 4739*c0909341SAndroid Build Coastguard Worker psrad m14, m16, 13 4740*c0909341SAndroid Build Coastguard Worker call .main2 4741*c0909341SAndroid Build Coastguard Worker psrad m16, 13 4742*c0909341SAndroid Build Coastguard Worker packusdw m14, m16 4743*c0909341SAndroid Build Coastguard Worker call .main2 4744*c0909341SAndroid Build Coastguard Worker psrad m15, m16, 13 4745*c0909341SAndroid Build Coastguard Worker call .main2 4746*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+bidir_shift+t0*4] 4747*c0909341SAndroid Build Coastguard Worker vpsrlvw m14, m0 4748*c0909341SAndroid Build Coastguard Worker psrad m16, 13 4749*c0909341SAndroid Build Coastguard Worker packusdw m15, m16 4750*c0909341SAndroid Build Coastguard Worker vpsrlvw m15, m0 4751*c0909341SAndroid Build Coastguard Worker.end: 4752*c0909341SAndroid Build Coastguard Worker mova m0, [base+warp8x8_end] 4753*c0909341SAndroid Build Coastguard Worker vpermb m16, m0, m14 4754*c0909341SAndroid Build Coastguard Worker lea r2, [dsq*3] 4755*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm16 4756*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], ym16, 1 4757*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*2], m16, 2 4758*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m16, 3 4759*c0909341SAndroid Build Coastguard Worker vpermb m16, m0, m15 4760*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 4761*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm16 4762*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+dsq*1], ym16, 1 4763*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*2], m16, 2 4764*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m16, 3 4765*c0909341SAndroid Build Coastguard Worker RET 4766*c0909341SAndroid Build Coastguard Worker.main: 4767*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym3, [base+pd_512] 
4768*c0909341SAndroid Build Coastguard Worker%if WIN64 4769*c0909341SAndroid Build Coastguard Worker mov abcdq, r5mp 4770*c0909341SAndroid Build Coastguard Worker vpaddd ym18, ym3, r6m {1to8} ; mx 4771*c0909341SAndroid Build Coastguard Worker%else 4772*c0909341SAndroid Build Coastguard Worker add r5d, 512 4773*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym18, r5d 4774*c0909341SAndroid Build Coastguard Worker%endif 4775*c0909341SAndroid Build Coastguard Worker vpaddd ym20, ym3, r7m {1to8} ; my 4776*c0909341SAndroid Build Coastguard Worker mova ym16, [base+pd_0to7] 4777*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym19, [abcdq+4*0] ; alpha 4778*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym21, [abcdq+4*1] ; gamma 4779*c0909341SAndroid Build Coastguard Worker lea r4, [ssq*3+6] 4780*c0909341SAndroid Build Coastguard Worker vpdpwssd ym18, ym19, ym16 ; tmx 4781*c0909341SAndroid Build Coastguard Worker vpdpwssd ym20, ym21, ym16 ; tmy 4782*c0909341SAndroid Build Coastguard Worker sub srcq, r4 4783*c0909341SAndroid Build Coastguard Worker mova m10, [base+warp8x8_permA] 4784*c0909341SAndroid Build Coastguard Worker lea r4, [mc_warp_filter+64*8] 4785*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m12, [base+warp8x8_permC] 4786*c0909341SAndroid Build Coastguard Worker kxnorb k1, k1, k1 4787*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m13, [base+warp8x8_permD] 4788*c0909341SAndroid Build Coastguard Worker movu ym5, [srcq+0] 4789*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [srcq+8], 1 4790*c0909341SAndroid Build Coastguard Worker psrad ym17, ym18, 10 4791*c0909341SAndroid Build Coastguard Worker mova m11, [base+warp8x8_permB] 4792*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 4793*c0909341SAndroid Build Coastguard Worker vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 4794*c0909341SAndroid Build Coastguard Worker psrad ym19, 16 ; beta 4795*c0909341SAndroid Build Coastguard Worker psrad ym21, 16 ; delta 
4796*c0909341SAndroid Build Coastguard Worker paddd ym18, ym19 4797*c0909341SAndroid Build Coastguard Worker vpermb m4, m10, m5 4798*c0909341SAndroid Build Coastguard Worker vpbroadcastq m9, [base+warp_shift_h+t0*8] 4799*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q3120 4800*c0909341SAndroid Build Coastguard Worker paddd m7, m1, m1 4801*c0909341SAndroid Build Coastguard Worker pshufb m2, m3, m12 4802*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m2 4803*c0909341SAndroid Build Coastguard Worker vpermb m5, m11, m5 4804*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m5, q1021 4805*c0909341SAndroid Build Coastguard Worker pshufb m3, m13 4806*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m3 4807*c0909341SAndroid Build Coastguard Worker call .h 4808*c0909341SAndroid Build Coastguard Worker psllq m2, m1, 32 4809*c0909341SAndroid Build Coastguard Worker paddd m1, m2 4810*c0909341SAndroid Build Coastguard Worker vpmultishiftqb m1, m9, m1 4811*c0909341SAndroid Build Coastguard Worker vpshrdq m1, m0, 48 ; 01 12 4812*c0909341SAndroid Build Coastguard Worker call .h 4813*c0909341SAndroid Build Coastguard Worker vpshrdq m2, m1, m0, 48 ; 23 34 4814*c0909341SAndroid Build Coastguard Worker call .h 4815*c0909341SAndroid Build Coastguard Worker vpshrdq m3, m2, m0, 48 ; 45 56 4816*c0909341SAndroid Build Coastguard Worker.main2: 4817*c0909341SAndroid Build Coastguard Worker call .h 4818*c0909341SAndroid Build Coastguard Worker psrad ym6, ym20, 10 4819*c0909341SAndroid Build Coastguard Worker kmovb k1, k2 4820*c0909341SAndroid Build Coastguard Worker paddd ym17, ym20, ym21 ; my += delta 4821*c0909341SAndroid Build Coastguard Worker vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 4822*c0909341SAndroid Build Coastguard Worker psrad ym16, ym17, 10 4823*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 4824*c0909341SAndroid Build Coastguard Worker vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 4825*c0909341SAndroid Build Coastguard Worker shufps m5, 
m20, m6, q2020 4826*c0909341SAndroid Build Coastguard Worker mova m16, m8 4827*c0909341SAndroid Build Coastguard Worker pshufb m4, m5, m12 4828*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m1, m4 ; a0 b0 4829*c0909341SAndroid Build Coastguard Worker pshufb m5, m13 4830*c0909341SAndroid Build Coastguard Worker mova m1, m2 4831*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m2, m5 ; a1 b1 4832*c0909341SAndroid Build Coastguard Worker shufps m6, m20, m6, q3131 4833*c0909341SAndroid Build Coastguard Worker paddd ym20, ym17, ym21 4834*c0909341SAndroid Build Coastguard Worker pshufb m4, m6, m12 4835*c0909341SAndroid Build Coastguard Worker mova m2, m3 4836*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m3, m4 ; a2 b2 4837*c0909341SAndroid Build Coastguard Worker vpshrdq m3, m0, 48 ; 67 78 4838*c0909341SAndroid Build Coastguard Worker pshufb m6, m13 4839*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m3, m6 ; a3 b3 4840*c0909341SAndroid Build Coastguard Worker ret 4841*c0909341SAndroid Build Coastguard WorkerALIGN function_align 4842*c0909341SAndroid Build Coastguard Worker.h: 4843*c0909341SAndroid Build Coastguard Worker movu ym16, [srcq+ssq*1] 4844*c0909341SAndroid Build Coastguard Worker psrad ym6, ym18, 10 4845*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 4846*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, m16, [srcq+ssq*0], 1 4847*c0909341SAndroid Build Coastguard Worker kmovb k1, k2 4848*c0909341SAndroid Build Coastguard Worker paddd ym17, ym18, ym19 ; mx += beta 4849*c0909341SAndroid Build Coastguard Worker vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 4850*c0909341SAndroid Build Coastguard Worker psrad ym16, ym17, 10 4851*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 4852*c0909341SAndroid Build Coastguard Worker vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 4853*c0909341SAndroid Build Coastguard Worker vpermb m4, m10, m5 4854*c0909341SAndroid Build Coastguard Worker shufps m16, m18, m6, q2020 
4855*c0909341SAndroid Build Coastguard Worker shufps m6, m18, m6, q3131 4856*c0909341SAndroid Build Coastguard Worker mova m0, m7 4857*c0909341SAndroid Build Coastguard Worker pshufb m18, m16, m12 4858*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m4, m18 ; a0 b0 4859*c0909341SAndroid Build Coastguard Worker vpermb m5, m11, m5 4860*c0909341SAndroid Build Coastguard Worker pshufb m18, m6, m13 4861*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m5, m18 ; a3 b3 4862*c0909341SAndroid Build Coastguard Worker paddd ym18, ym17, ym19 4863*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m4, m5, q1021 4864*c0909341SAndroid Build Coastguard Worker pshufb m16, m13 4865*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m17, m16 ; a1 b1 4866*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m5, q2132 4867*c0909341SAndroid Build Coastguard Worker pshufb m6, m12 4868*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m4, m6 ; a2 b2 4869*c0909341SAndroid Build Coastguard Worker vpmultishiftqb m0, m9, m0 ; a a b b 4870*c0909341SAndroid Build Coastguard Worker ret 4871*c0909341SAndroid Build Coastguard Worker 4872*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 0 4873*c0909341SAndroid Build Coastguard Worker call .main 4874*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4875*c0909341SAndroid Build Coastguard Worker jmp wq 4876*c0909341SAndroid Build Coastguard Worker.w4: 4877*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm0 4878*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 4879*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym0, 1 4880*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 4881*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm2 4882*c0909341SAndroid Build Coastguard Worker cmp hd, 8 4883*c0909341SAndroid Build Coastguard Worker jl .w4_end 4884*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 2 
4885*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4886*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm2 4887*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm2 4888*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, m0, 3 4889*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 4890*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 4891*c0909341SAndroid Build Coastguard Worker je .w4_end 4892*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4893*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm1 4894*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 4895*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, ym1, 1 4896*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 4897*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 4898*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, m1, 2 4899*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4900*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm0 4901*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 4902*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m1, 3 4903*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 4904*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 4905*c0909341SAndroid Build Coastguard Worker.w4_end: 4906*c0909341SAndroid Build Coastguard Worker RET 4907*c0909341SAndroid Build Coastguard Worker.w8_loop: 4908*c0909341SAndroid Build Coastguard Worker call .main 4909*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4910*c0909341SAndroid Build Coastguard Worker.w8: 4911*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 4912*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym0, 1 4913*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], 
m0, 2 4914*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 4915*c0909341SAndroid Build Coastguard Worker sub hd, 8 4916*c0909341SAndroid Build Coastguard Worker jl .w8_end 4917*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4918*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm1 4919*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym1, 1 4920*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m1, 2 4921*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m1, 3 4922*c0909341SAndroid Build Coastguard Worker jg .w8_loop 4923*c0909341SAndroid Build Coastguard Worker.w8_end: 4924*c0909341SAndroid Build Coastguard Worker RET 4925*c0909341SAndroid Build Coastguard Worker.w16_loop: 4926*c0909341SAndroid Build Coastguard Worker call .main 4927*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4928*c0909341SAndroid Build Coastguard Worker.w16: 4929*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 4930*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 4931*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], ym1 4932*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+stride3q ], m1, 1 4933*c0909341SAndroid Build Coastguard Worker sub hd, 4 4934*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4935*c0909341SAndroid Build Coastguard Worker RET 4936*c0909341SAndroid Build Coastguard Worker.w32_loop: 4937*c0909341SAndroid Build Coastguard Worker call .main 4938*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4939*c0909341SAndroid Build Coastguard Worker.w32: 4940*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 4941*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 4942*c0909341SAndroid Build Coastguard Worker sub hd, 2 4943*c0909341SAndroid Build Coastguard Worker jg .w32_loop 4944*c0909341SAndroid 
; Widest store loops (.w64/.w128) of the shared store macro that closes at
; %endmacro below. Every pass stores the two 64-byte vectors m0/m1 produced by
; the expanding function's own .main routine.
; NOTE(review): presumably this macro is BIDIR_FN, which avg/w_avg/mask expand
; further down -- confirm against the macro's opening line.
    RET
.w64_loop:
    call           .main
    add            dstq, strideq
.w64:
    mova           [dstq+64*0], m0
    mova           [dstq+64*1], m1
    dec            hd
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    add            dstq, strideq
.w128:
    mova           [dstq+64*0], m0
    mova           [dstq+64*1], m1
    call           .main
    mova           [dstq+64*2], m0
    mova           [dstq+64*3], m1
    dec            hd
    jg             .w128_loop
    RET
%endmacro

; Scratch GPR "t0" used by avg_16bpc and w_avg_16bpc: r5 on WIN64, r7 elsewhere.
; NOTE(review): with h being the sixth named argument, r5 is presumably also
; h's register on WIN64, which would explain why both functions only load hd
; after their last use of t0 -- confirm against x86inc before reordering.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

;-----------------------------------------------------------------------------
; avg_16bpc(dst, stride, tmp1, tmp2, w, h, pixel_max)
;
; Combines the two intermediate buffers tmp1/tmp2 into dst without weighting.
; Setup: r6 = jump-table address (also the anchor of "base"), t0 = pixel_max>>11
; picks the avg_round/avg_shift entry, wq = address of the per-width store label.
; NOTE(review): pixel_max >> 11 presumably yields 0 for 10-bit and 1 for 12-bit.
; .main: per 16-bit word, m = (max(sat(tmp1 + tmp2), round) - round) >> shift,
; for 64 words (m0, m1); tmp1q/tmp2q each advance by 128 bytes.
;-----------------------------------------------------------------------------
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea            r6, [avg_avx512icl_table]
    tzcnt          wd, wm                      ; width -> table index
    mov            t0d, r6m                    ; pixel_max
    movsxd         wq, [r6+wq*4]               ; signed 32-bit table entry
    shr            t0d, 11                     ; constant selector
    vpbroadcastd   m2, [base+avg_round+t0*4]
    vpbroadcastd   m3, [base+avg_shift+t0*4]
    movifnidn      hd, hm                      ; kept after the last t0 use
    add            wq, r6                      ; table entry + table address
    BIDIR_FN
ALIGN function_align
.main:
    mova           m0, [tmp1q+64*0]
    paddsw         m0, [tmp2q+64*0]            ; signed saturating tmp1 + tmp2
    mova           m1, [tmp1q+64*1]
    paddsw         m1, [tmp2q+64*1]
    add            tmp1q, 64*2
    add            tmp2q, 64*2
    pmaxsw         m0, m2                      ; max(sum, round) ...
    pmaxsw         m1, m2
    psubsw         m0, m2                      ; ... minus round: never below 0
    psubsw         m1, m2
    vpsrlvw        m0, m3                      ; per-word logical right shift
    vpsrlvw        m1, m3
    ret

;-----------------------------------------------------------------------------
; w_avg_16bpc(dst, stride, tmp1, tmp2, w, h, weight, pixel_max)
;
; Weighted combination of tmp1/tmp2. m5 = rounding constant, m7 = final shift
; (both selected by pixel_max >> 11), m6 = dword pattern holding (16 - weight)
; in the low word and weight in the high word.
; .main: per word, ((16-weight)*tmp2 + weight*tmp1 + round) >> 2, packed back to
; words with unsigned saturation, then shifted right by m7. Produces 64 words
; in m0/m1 and advances tmp1q/tmp2q by 128 bytes each.
;-----------------------------------------------------------------------------
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea            r6, [w_avg_avx512icl_table]
    tzcnt          wd, wm
    mov            t0d, r7m                    ; pixel_max
    shr            t0d, 11
    movsxd         wq, [r6+wq*4]
    vpbroadcastd   m5, [base+w_avg_round+t0*4]
    vpbroadcastd   m7, [base+bidir_shift+t0*4]
    add            wq, r6                      ; last use of r6 as table address
    mov            r6d, r6m                    ; weight
    lea            t0d, [r6-16]                ; weight - 16
    shl            r6d, 16                     ; weight << 16
    sub            r6d, t0d                    ; 16-weight, weight
    movifnidn      hd, hm                      ; kept after the last t0 use
    vpbroadcastd   m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    mova           m3, [tmp1q+64*0]
    mova           m1, [tmp2q+64*0]
    mova           m0, [tmp1q+64*1]
    mova           m4, [tmp2q+64*1]
    add            tmp1q, 64*2
    add            tmp2q, 64*2
    punpcklwd      m2, m1, m3                  ; (tmp2, tmp1) word pairs
    punpckhwd      m1, m3
    punpcklwd      m3, m4, m0
    punpckhwd      m4, m0
    mova           m0, m5                      ; accumulators start at round
    vpdpwssd       m0, m6, m2                  ; += (16-weight)*tmp2 + weight*tmp1
    mova           m2, m5
    vpdpwssd       m2, m6, m1
    mova           m1, m5
    vpdpwssd       m1, m6, m3
    mova           m3, m5
    vpdpwssd       m3, m6, m4
    psrad          m0, 2
    psrad          m2, 2
    psrad          m1, 2
    psrad          m3, 2
    packusdw       m0, m2                      ; dwords -> words, unsigned sat.
    packusdw       m1, m3
    vpsrlvw        m0, m7
    vpsrlvw        m1, m7
    ret

;-----------------------------------------------------------------------------
; mask_16bpc(dst, stride, tmp1, tmp2, w, h, mask, pixel_max)
;
; Blends tmp1/tmp2 with a caller-supplied byte mask read from maskq.
; r6 (mask's register) doubles as scratch for pixel_max until maskq is loaded;
; r7 holds the jump-table address. m8 = pw_64, m9 = rounding, m10 = final shift.
; .main: per word, (tmp1*m + tmp2*(64-m) + round) >> 4, packed with unsigned
; saturation, then shifted right by m10. Consumes 64 mask bytes and 128 bytes
; from each of tmp1/tmp2 per call; results in m0/m1.
;-----------------------------------------------------------------------------
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea            r7, [mask_avx512icl_table]
    tzcnt          wd, wm
    mov            r6d, r7m                    ; pixel_max
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m8, [base+pw_64]
    vpbroadcastd   m9, [base+mask_round+r6*4]
    vpbroadcastd   m10, [base+bidir_shift+r6*4]
    mov            maskq, maskmp               ; r6 is free again from here on
    add            wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    pmovzxbw       m1, [maskq+32*0]            ; 32 mask bytes -> words
    mova           m4, [tmp1q+64*0]
    mova           m2, [tmp2q+64*0]
    pmovzxbw       m6, [maskq+32*1]
    mova           m5, [tmp1q+64*1]
    mova           m3, [tmp2q+64*1]
    add            maskq, 32*2
    add            tmp1q, 64*2
    add            tmp2q, 64*2
    punpcklwd      m7, m4, m2                  ; (tmp1, tmp2) word pairs
    punpckhwd      m4, m2
    psubw          m0, m8, m1                  ; 64 - m
    punpcklwd      m2, m1, m0                  ; m, 64-m
    punpckhwd      m1, m0
    mova           m0, m9
    vpdpwssd       m0, m7, m2
    mova           m2, m9
    vpdpwssd       m2, m4, m1                  ; tmp1 * m + tmp2 * (64-m)
    punpcklwd      m7, m5, m3                  ; same again for the second vector
    punpckhwd      m5, m3
    psubw          m1, m8, m6
    punpcklwd      m3, m6, m1
    punpckhwd      m6, m1
    mova           m1, m9
    vpdpwssd       m1, m7, m3
    mova           m3, m9
    vpdpwssd       m3, m5, m6
    psrad          m0, 4
    psrad          m2, 4
    psrad          m1, 4
    psrad          m3, 4
    packusdw       m0, m2
    packusdw       m1, m3
    vpsrlvw        m0, m10
    vpsrlvw        m1, m10
    ret

;-----------------------------------------------------------------------------
; w_mask_420_16bpc(dst, stride, tmp1, tmp2, w, h, mask, sign, pixel_max)
;
; Derives a blend mask from |tmp1 - tmp2|, blends with it, and writes a reduced
; mask to maskq. Constants: m10 = pw_27615, m11 = pw_64, m12 = blend rounding,
; m13 = final shift, m14 = w_mask_round[sign], ym15 = byte selector
; (w_mask_end42x).
; .main leaves: m0/m1 = 64 blended pixels, m2/m3 = their per-pixel mask words.
; The per-width code stores the pixels and folds m2/m3 into the output mask.
; NOTE(review): the folding presumably sums each 2x2 group of mask values
; (4:2:0 subsampling); the shuffle tables that decide this are not shown here.
; The first .main call happens before the width dispatch, so every .wN label is
; entered with one batch already computed.
;-----------------------------------------------------------------------------
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea            r7, [w_mask_420_avx512icl_table]
    tzcnt          wd, wm
    mov            r6d, r8m                    ; pixel_max
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m10, [base+pw_27615]        ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd   m11, [base+pw_64]
    vpbroadcastd   m12, [base+mask_round+r6*4]
    vpbroadcastd   m13, [base+bidir_shift+r6*4]
    mov            r6d, r7m                    ; sign
    vpbroadcastd   m14, [base+w_mask_round+r6*4]
    mova           ym15, [w_mask_end42x]
    mov            maskq, maskmp               ; r6 stops being scratch here
    add            wq, r7                      ; last use of r7 as table address
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    ; One .main batch covers the whole block here: no loop, rows are stored
    ; four at a time and the flags of "cmp hd, 8" are reused for both exits.
    mova           m4, [w_mask_shuf4]
    vpermt2b       m2, m4, m3                  ; gather mask bytes from m2/m3
    mova           m3, m14
    vpdpbusd       m3, m2, [pb_64] {1to16}     ; round + 64 * (sum of 4 bytes)
    vpermb         m3, m15, m3
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti32x4  xm2, ym0, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q], xm2
    mova           [maskq], xm3
    cmp            hd, 8
    jl             .w4_end                     ; h < 8: done after 4 rows
    vextracti32x4  xm2, m0, 2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti32x4  xm0, m0, 3
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q], xm0
    je             .w4_end                     ; h == 8 (flags still from cmp)
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti32x4  xm2, ym1, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q], xm2
    vextracti32x4  xm2, m1, 2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti32x4  xm1, m1, 3
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q], xm1
.w4_end:
    RET
.w8:
    mova           m8, [w_mask_shuf8]
    vpbroadcastd   m9, [pb_64]
    jmp            .w8_start
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 16
.w8_start:
    vpermt2b       m2, m8, m3
    mova           m3, m14
    vpdpbusd       m3, m2, m9
    vpermb         m3, m15, m3
    mova           [dstq+strideq*0], xm0
    vextracti32x4  [dstq+strideq*1], ym0, 1
    vextracti32x4  [dstq+strideq*2], m0, 2
    vextracti32x4  [dstq+stride3q], m0, 3
    mova           [maskq], xm3
    sub            hd, 8
    jl             .w8_end                     ; only 4 rows were left
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm1
    vextracti32x4  [dstq+strideq*1], ym1, 1
    vextracti32x4  [dstq+strideq*2], m1, 2
    vextracti32x4  [dstq+stride3q], m1, 3
    jg             .w8_loop                    ; flags still from "sub hd, 8"
.w8_end:
    RET
.w16:
    mova           m8, [w_mask_shuf16]
    vpbroadcastd   m9, [pb_64]
    jmp            .w16_start
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 16
.w16_start:
    vpermt2b       m2, m8, m3
    mova           m3, m14
    vpdpbusd       m3, m2, m9
    vpermb         m3, m15, m3
    mova           [dstq+strideq*0], ym0
    vextracti32x8  [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], ym1
    vextracti32x8  [dstq+stride3q], m1, 1
    mova           [maskq], xm3
    sub            hd, 4
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 32
.w32:
    ; Two .main batches (two rows each) per pass; m2 + m3 adds the mask words
    ; of the two rows of a batch before the horizontal pair sum.
    paddw          m2, m3
    mova           m8, m14
    vpdpwssd       m8, m11, m2                 ; round + 64 * (word pair sum)
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    call           .main
    paddw          m2, m3
    mova           m3, m14
    vpdpwssd       m3, m11, m2
    vpermt2b       m8, m15, m3                 ; pick result bytes of both halves
    mova           [dstq+strideq*2], m0
    mova           [dstq+stride3q], m1
    mova           [maskq], ym8
    sub            hd, 4
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
    add            maskq, 32
.w64:
    ; One .main batch per row; the first row's masks wait in m8/m9 until the
    ; second row's masks are added to them.
    mova           m8, m2
    mova           m9, m3
    mova           [dstq+strideq*0+64*0], m0
    mova           [dstq+strideq*0+64*1], m1
    call           .main
    paddw          m8, m2
    paddw          m9, m3
    mova           m2, m14
    vpdpwssd       m2, m11, m8
    mova           m3, m14
    vpdpwssd       m3, m11, m9
    vpermt2b       m2, m15, m3
    mova           [dstq+strideq*1+64*0], m0
    mova           [dstq+strideq*1+64*1], m1
    mova           [maskq], ym2
    sub            hd, 2
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
    add            maskq, 64
.w128:
    ; Two .main batches per row, two rows per pass. The first row's masks are
    ; parked in m16/m8 and m17/m9 and added to the second row's masks.
    mova           m16, m2
    mova           m8, m3
    mova           [dstq+strideq*0+64*0], m0
    mova           [dstq+strideq*0+64*1], m1
    call           .main
    mova           m17, m2
    mova           m9, m3
    mova           [dstq+strideq*0+64*2], m0
    mova           [dstq+strideq*0+64*3], m1
    call           .main
    paddw          m2, m16
    paddw          m3, m8
    mova           m16, m14
    vpdpwssd       m16, m11, m2
    mova           m8, m14
    vpdpwssd       m8, m11, m3
    mova           [dstq+strideq*1+64*0], m0
    mova           [dstq+strideq*1+64*1], m1
    call           .main
    paddw          m2, m17
    paddw          m3, m9
    mova           m17, m14
    vpdpwssd       m17, m11, m2
    mova           m9, m14
    vpdpwssd       m9, m11, m3
    vpermt2b       m16, m15, m8
    vpermt2b       m17, m15, m9
    mova           [dstq+strideq*1+64*2], m0
    mova           [dstq+strideq*1+64*3], m1
    mova           [maskq+32*0], ym16
    mova           [maskq+32*1], ym17
    sub            hd, 2
    jg             .w128_loop
    ; m16/m17 lie outside the 16 vector registers declared in cglobal.
    ; NOTE(review): presumably that is why vzeroupper is issued by hand on this
    ; path only -- confirm against x86inc's RET handling.
    vzeroupper
    RET
ALIGN function_align
.main:
    ; In:  tmp1q/tmp2q (advanced by 128 bytes each).
    ; Out: m0/m1 = blended pixels, m2/m3 = mask words "m" of those pixels.
    ; Clobbers m4-m7.
    mova           m1, [tmp1q+64*0]
    mova           m3, [tmp2q+64*0]
    mova           m4, [tmp1q+64*1]
    mova           m7, [tmp2q+64*1]
    add            tmp1q, 64*2
    add            tmp2q, 64*2
    psubsw         m6, m1, m3
    punpcklwd      m5, m3, m1                  ; (tmp2, tmp1) word pairs
    pabsw          m6, m6                      ; |tmp1 - tmp2|
    punpckhwd      m3, m1
    psubusw        m6, m10, m6                 ; unsigned saturating: floor at 0
    psrlw          m6, 10                      ; 64-m
    psubw          m2, m11, m6                 ; m
    punpcklwd      m1, m6, m2                  ; (64-m, m) word pairs
    punpckhwd      m6, m2
    mova           m0, m12
    vpdpwssd       m0, m5, m1                  ; round + tmp2*(64-m) + tmp1*m
    mova           m1, m12
    vpdpwssd       m1, m3, m6
    psubsw         m5, m4, m7                  ; second vector, same steps
    punpcklwd      m6, m7, m4
    pabsw          m5, m5
    punpckhwd      m7, m4
    psubusw        m5, m10, m5
    psrlw          m5, 10
    psubw          m3, m11, m5
    punpcklwd      m4, m5, m3
    psrad          m0, 4
    punpckhwd      m5, m3
    psrad          m1, 4
    packusdw       m0, m1
    mova           m1, m12
    vpdpwssd       m1, m6, m4
    mova           m4, m12
    vpdpwssd       m4, m7, m5
    psrad          m1, 4
    psrad          m4, 4
    packusdw       m1, m4
    vpsrlvw        m0, m13
    vpsrlvw        m1, m13
    ret

;-----------------------------------------------------------------------------
; w_mask_422_16bpc(dst, stride, tmp1, tmp2, w, h, mask, sign, pixel_max)
;
; Same blend as w_mask_420_16bpc, but the mask is written by .main itself:
; each call stores 32 mask bytes for its 64 pixels and advances maskq, so the
; per-width code below only has to store pixels.
; Constants: m8 = pw_27615, m9 = pw_64, m10 = blend rounding, m11 = final
; shift, m12 = w_mask_round[sign], ym13 = byte selector (w_mask_end42x),
; m14 = pw_128 (built as m9 + m9).
;-----------------------------------------------------------------------------
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea            r7, [w_mask_422_avx512icl_table]
    tzcnt          wd, wm
    mov            r6d, r8m                    ; pixel_max
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m8, [base+pw_27615]         ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd   m9, [base+pw_64]
    vpbroadcastd   m10, [base+mask_round+r6*4]
    vpbroadcastd   m11, [base+bidir_shift+r6*4]
    mov            r6d, r7m                    ; sign
    vpbroadcastd   m12, [base+w_mask_round+r6*4]
    mova           ym13, [w_mask_end42x]
    mov            maskq, maskmp               ; r6 stops being scratch here
    add            wq, r7                      ; last use of r7 as table address
    paddw          m14, m9, m9                 ; pw_128
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    ; Single batch, no loop; both exits reuse the flags of "cmp hd, 8".
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti32x4  xm2, ym0, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q], xm2
    cmp            hd, 8
    jl             .w4_end
    vextracti32x4  xm2, m0, 2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti32x4  xm0, m0, 3
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q], xm0
    je             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti32x4  xm2, ym1, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q], xm2
    vextracti32x4  xm2, m1, 2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti32x4  xm1, m1, 3
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q], xm1
.w4_end:
    RET
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w8:
    mova           [dstq+strideq*0], xm0
    vextracti32x4  [dstq+strideq*1], ym0, 1
    vextracti32x4  [dstq+strideq*2], m0, 2
    vextracti32x4  [dstq+stride3q], m0, 3
    sub            hd, 8
    jl             .w8_end                     ; only 4 rows were left
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm1
    vextracti32x4  [dstq+strideq*1], ym1, 1
    vextracti32x4  [dstq+strideq*2], m1, 2
    vextracti32x4  [dstq+stride3q], m1, 3
    jg             .w8_loop                    ; flags still from "sub hd, 8"
.w8_end:
    RET
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w16:
    mova           [dstq+strideq*0], ym0
    vextracti32x8  [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], ym1
    vextracti32x8  [dstq+stride3q], m1, 1
    sub            hd, 4
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
.w32:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    sub            hd, 2
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    add            dstq, strideq
.w64:
    mova           [dstq+64*0], m0
    mova           [dstq+64*1], m1
    dec            hd
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    add            dstq, strideq
.w128:
    mova           [dstq+64*0], m0
    mova           [dstq+64*1], m1
    call           .main
    mova           [dstq+64*2], m0
    mova           [dstq+64*3], m1
    dec            hd
    jg             .w128_loop
    RET
ALIGN function_align
.main:
    ; In:  tmp1q/tmp2q (advanced by 128 bytes each), maskq (advanced by 32).
    ; Out: m0/m1 = blended pixels; 32 mask bytes stored at the incoming maskq.
    ; Clobbers m2-m7.
    mova           m1, [tmp1q+64*0]
    mova           m3, [tmp2q+64*0]
    mova           m4, [tmp1q+64*1]
    mova           m7, [tmp2q+64*1]
    add            tmp1q, 64*2
    add            tmp2q, 64*2
    psubsw         m6, m1, m3
    punpcklwd      m5, m3, m1                  ; (tmp2, tmp1) word pairs
    pabsw          m6, m6                      ; |tmp1 - tmp2|
    punpckhwd      m3, m1
    psubusw        m6, m8, m6                  ; unsigned saturating: floor at 0
    psrlw          m6, 10                      ; 64-m
    psubw          m2, m9, m6                  ; m
    punpcklwd      m1, m6, m2                  ; (64-m, m) word pairs
    punpckhwd      m6, m2
    mova           m0, m10
    vpdpwssd       m0, m5, m1                  ; round + tmp2*(64-m) + tmp1*m
    mova           m1, m10
    vpdpwssd       m1, m3, m6
    psubsw         m5, m4, m7                  ; second vector, same steps
    punpcklwd      m6, m7, m4
    pabsw          m5, m5
    punpckhwd      m7, m4
    psubusw        m5, m8, m5
    psrlw          m5, 10
    psubw          m3, m9, m5
    punpcklwd      m4, m5, m3
    psrad          m0, 4
    punpckhwd      m5, m3
    psrad          m1, 4
    packusdw       m0, m1
    mova           m1, m10
    vpdpwssd       m1, m6, m4
    mova           m4, m10
    vpdpwssd       m4, m7, m5
    mova           m5, m12
    vpdpwssd       m5, m14, m2                 ; round + 128 * (mask pair sum)
    mova           m2, m12
    vpdpwssd       m2, m14, m3
    psrad          m1, 4
    psrad          m4, 4
    packusdw       m1, m4
    vpermt2b       m5, m13, m2                 ; pick the mask result bytes
    vpsrlvw        m0, m11
    vpsrlvw        m1, m11
    mova           [maskq], ym5
    add            maskq, 32
    ret

cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride,
tmp1, tmp2, w, h, mask, stride3 5493*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx512icl_table 5494*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_444_avx512icl_table] 5495*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5496*c0909341SAndroid Build Coastguard Worker mov r6d, r8m ; pixel_max 5497*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5498*c0909341SAndroid Build Coastguard Worker shr r6d, 11 5499*c0909341SAndroid Build Coastguard Worker movsxd wq, [r7+wq*4] 5500*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 5501*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pw_64] 5502*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+mask_round+r6*4] 5503*c0909341SAndroid Build Coastguard Worker mova m11, [w_mask_end444] 5504*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+bidir_shift+r6*4] 5505*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 5506*c0909341SAndroid Build Coastguard Worker add wq, r7 5507*c0909341SAndroid Build Coastguard Worker call .main 5508*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 5509*c0909341SAndroid Build Coastguard Worker jmp wq 5510*c0909341SAndroid Build Coastguard Worker.w4: 5511*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 5512*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 5513*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym0, 1 5514*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 5515*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm2 5516*c0909341SAndroid Build Coastguard Worker cmp hd, 8 5517*c0909341SAndroid Build Coastguard Worker jl .w4_end 5518*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 2 5519*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5520*c0909341SAndroid Build Coastguard Worker movq 
[dstq+strideq*0], xm2 5521*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm2 5522*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, m0, 3 5523*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm0 5524*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm0 5525*c0909341SAndroid Build Coastguard Worker je .w4_end 5526*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5527*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm1 5528*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 5529*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym1, 1 5530*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 5531*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm2 5532*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m1, 2 5533*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5534*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm2 5535*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm2 5536*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m1, 3 5537*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 5538*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 5539*c0909341SAndroid Build Coastguard Worker.w4_end: 5540*c0909341SAndroid Build Coastguard Worker RET 5541*c0909341SAndroid Build Coastguard Worker.w8_loop: 5542*c0909341SAndroid Build Coastguard Worker call .main 5543*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5544*c0909341SAndroid Build Coastguard Worker.w8: 5545*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 5546*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym0, 1 5547*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m0, 2 5548*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 
5549*c0909341SAndroid Build Coastguard Worker sub hd, 8 5550*c0909341SAndroid Build Coastguard Worker jl .w8_end 5551*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5552*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm1 5553*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym1, 1 5554*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m1, 2 5555*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m1, 3 5556*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5557*c0909341SAndroid Build Coastguard Worker.w8_end: 5558*c0909341SAndroid Build Coastguard Worker RET 5559*c0909341SAndroid Build Coastguard Worker.w16_loop: 5560*c0909341SAndroid Build Coastguard Worker call .main 5561*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5562*c0909341SAndroid Build Coastguard Worker.w16: 5563*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 5564*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 5565*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], ym1 5566*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+stride3q ], m1, 1 5567*c0909341SAndroid Build Coastguard Worker sub hd, 4 5568*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5569*c0909341SAndroid Build Coastguard Worker RET 5570*c0909341SAndroid Build Coastguard Worker.w32_loop: 5571*c0909341SAndroid Build Coastguard Worker call .main 5572*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 5573*c0909341SAndroid Build Coastguard Worker.w32: 5574*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 5575*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 5576*c0909341SAndroid Build Coastguard Worker sub hd, 2 5577*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5578*c0909341SAndroid Build Coastguard Worker RET 5579*c0909341SAndroid Build Coastguard Worker.w64_loop: 
5580*c0909341SAndroid Build Coastguard Worker call .main 5581*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5582*c0909341SAndroid Build Coastguard Worker.w64: 5583*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 5584*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1 5585*c0909341SAndroid Build Coastguard Worker dec hd 5586*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5587*c0909341SAndroid Build Coastguard Worker RET 5588*c0909341SAndroid Build Coastguard Worker.w128_loop: 5589*c0909341SAndroid Build Coastguard Worker call .main 5590*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5591*c0909341SAndroid Build Coastguard Worker.w128: 5592*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 5593*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1 5594*c0909341SAndroid Build Coastguard Worker call .main 5595*c0909341SAndroid Build Coastguard Worker mova [dstq+64*2], m0 5596*c0909341SAndroid Build Coastguard Worker mova [dstq+64*3], m1 5597*c0909341SAndroid Build Coastguard Worker dec hd 5598*c0909341SAndroid Build Coastguard Worker jg .w128_loop 5599*c0909341SAndroid Build Coastguard Worker RET 5600*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5601*c0909341SAndroid Build Coastguard Worker.main: 5602*c0909341SAndroid Build Coastguard Worker mova m1, [tmp1q+64*0] 5603*c0909341SAndroid Build Coastguard Worker mova m3, [tmp2q+64*0] 5604*c0909341SAndroid Build Coastguard Worker mova m4, [tmp1q+64*1] 5605*c0909341SAndroid Build Coastguard Worker mova m7, [tmp2q+64*1] 5606*c0909341SAndroid Build Coastguard Worker add tmp1q, 64*2 5607*c0909341SAndroid Build Coastguard Worker add tmp2q, 64*2 5608*c0909341SAndroid Build Coastguard Worker psubsw m6, m1, m3 5609*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m3, m1 5610*c0909341SAndroid Build Coastguard Worker pabsw m6, m6 5611*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1 5612*c0909341SAndroid Build Coastguard Worker 
psubusw m6, m8, m6 5613*c0909341SAndroid Build Coastguard Worker psrlw m6, 10 5614*c0909341SAndroid Build Coastguard Worker psubw m2, m9, m6 5615*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m6, m2 5616*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m2 5617*c0909341SAndroid Build Coastguard Worker mova m0, m10 5618*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m5, m1 5619*c0909341SAndroid Build Coastguard Worker mova m1, m10 5620*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m3, m6 5621*c0909341SAndroid Build Coastguard Worker psubsw m5, m4, m7 5622*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7, m4 5623*c0909341SAndroid Build Coastguard Worker pabsw m5, m5 5624*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m4 5625*c0909341SAndroid Build Coastguard Worker psubusw m5, m8, m5 5626*c0909341SAndroid Build Coastguard Worker psrlw m5, 10 5627*c0909341SAndroid Build Coastguard Worker psubw m3, m9, m5 5628*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m3 5629*c0909341SAndroid Build Coastguard Worker psrad m0, 4 5630*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3 5631*c0909341SAndroid Build Coastguard Worker psrad m1, 4 5632*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 5633*c0909341SAndroid Build Coastguard Worker mova m1, m10 5634*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m6, m4 5635*c0909341SAndroid Build Coastguard Worker mova m4, m10 5636*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m7, m5 5637*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m11, m3 5638*c0909341SAndroid Build Coastguard Worker psrad m1, 4 5639*c0909341SAndroid Build Coastguard Worker psrad m4, 4 5640*c0909341SAndroid Build Coastguard Worker packusdw m1, m4 5641*c0909341SAndroid Build Coastguard Worker vpsrlvw m0, m12 5642*c0909341SAndroid Build Coastguard Worker vpsrlvw m1, m12 5643*c0909341SAndroid Build Coastguard Worker mova [maskq], m2 5644*c0909341SAndroid Build Coastguard Worker 
add maskq, 64 5645*c0909341SAndroid Build Coastguard Worker ret 5646*c0909341SAndroid Build Coastguard Worker 5647*c0909341SAndroid Build Coastguard Workercglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask 5648*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx512icl_table 5649*c0909341SAndroid Build Coastguard Worker lea r6, [blend_avx512icl_table] 5650*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5651*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5652*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 5653*c0909341SAndroid Build Coastguard Worker movifnidn maskq, maskmp 5654*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_m512] 5655*c0909341SAndroid Build Coastguard Worker add wq, r6 5656*c0909341SAndroid Build Coastguard Worker lea r6, [dsq*3] 5657*c0909341SAndroid Build Coastguard Worker jmp wq 5658*c0909341SAndroid Build Coastguard Worker.w4: 5659*c0909341SAndroid Build Coastguard Worker pmovzxbw ym19, [maskq] 5660*c0909341SAndroid Build Coastguard Worker movq xm16, [dstq+dsq*0] 5661*c0909341SAndroid Build Coastguard Worker movhps xm16, [dstq+dsq*1] 5662*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym17, [dstq+dsq*2] 5663*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym18, [dstq+r6 ] 5664*c0909341SAndroid Build Coastguard Worker pmullw ym19, ym6 5665*c0909341SAndroid Build Coastguard Worker vpblendd ym16, ym17, 0x30 5666*c0909341SAndroid Build Coastguard Worker vpblendd ym16, ym18, 0xc0 5667*c0909341SAndroid Build Coastguard Worker psubw ym17, ym16, [tmpq] 5668*c0909341SAndroid Build Coastguard Worker add maskq, 16 5669*c0909341SAndroid Build Coastguard Worker add tmpq, 32 5670*c0909341SAndroid Build Coastguard Worker pmulhrsw ym17, ym19 5671*c0909341SAndroid Build Coastguard Worker paddw ym16, ym17 5672*c0909341SAndroid Build Coastguard Worker vextracti128 xm17, ym16, 1 5673*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xm16 5674*c0909341SAndroid Build 
Coastguard Worker movhps [dstq+dsq*1], xm16 5675*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*2], xm17 5676*c0909341SAndroid Build Coastguard Worker movhps [dstq+r6 ], xm17 5677*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5678*c0909341SAndroid Build Coastguard Worker sub hd, 4 5679*c0909341SAndroid Build Coastguard Worker jg .w4 5680*c0909341SAndroid Build Coastguard Worker vzeroupper 5681*c0909341SAndroid Build Coastguard Worker RET 5682*c0909341SAndroid Build Coastguard Worker.w8: 5683*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [maskq] 5684*c0909341SAndroid Build Coastguard Worker mova xm0, [dstq+dsq*0] 5685*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [dstq+dsq*1], 1 5686*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [dstq+dsq*2], 2 5687*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [dstq+r6 ], 3 5688*c0909341SAndroid Build Coastguard Worker pmullw m2, m6 5689*c0909341SAndroid Build Coastguard Worker psubw m1, m0, [tmpq] 5690*c0909341SAndroid Build Coastguard Worker add maskq, 32 5691*c0909341SAndroid Build Coastguard Worker add tmpq, 64 5692*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 5693*c0909341SAndroid Build Coastguard Worker paddw m0, m1 5694*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5695*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 5696*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*2], m0, 2 5697*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r6 ], m0, 3 5698*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5699*c0909341SAndroid Build Coastguard Worker sub hd, 4 5700*c0909341SAndroid Build Coastguard Worker jg .w8 5701*c0909341SAndroid Build Coastguard Worker RET 5702*c0909341SAndroid Build Coastguard Worker.w16: 5703*c0909341SAndroid Build Coastguard Worker pmovzxbw m4, [maskq+32*0] 5704*c0909341SAndroid Build Coastguard Worker pmovzxbw m5, [maskq+32*1] 
5705*c0909341SAndroid Build Coastguard Worker mova ym0, [dstq+dsq*0] 5706*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [dstq+dsq*1], 1 5707*c0909341SAndroid Build Coastguard Worker mova ym1, [dstq+dsq*2] 5708*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [dstq+r6 ], 1 5709*c0909341SAndroid Build Coastguard Worker pmullw m4, m6 5710*c0909341SAndroid Build Coastguard Worker pmullw m5, m6 5711*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmpq+64*0] 5712*c0909341SAndroid Build Coastguard Worker psubw m3, m1, [tmpq+64*1] 5713*c0909341SAndroid Build Coastguard Worker add maskq, 32*2 5714*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5715*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 5716*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m5 5717*c0909341SAndroid Build Coastguard Worker paddw m0, m2 5718*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5719*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 5720*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5721*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*2], ym1 5722*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+r6 ], m1, 1 5723*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5724*c0909341SAndroid Build Coastguard Worker sub hd, 4 5725*c0909341SAndroid Build Coastguard Worker jg .w16 5726*c0909341SAndroid Build Coastguard Worker RET 5727*c0909341SAndroid Build Coastguard Worker.w32: 5728*c0909341SAndroid Build Coastguard Worker pmovzxbw m4, [maskq+32*0] 5729*c0909341SAndroid Build Coastguard Worker pmovzxbw m5, [maskq+32*1] 5730*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+dsq*0] 5731*c0909341SAndroid Build Coastguard Worker mova m1, [dstq+dsq*1] 5732*c0909341SAndroid Build Coastguard Worker pmullw m4, m6 5733*c0909341SAndroid Build Coastguard Worker pmullw m5, m6 5734*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmpq+ 64*0] 5735*c0909341SAndroid Build 
Coastguard Worker psubw m3, m1, [tmpq+ 64*1] 5736*c0909341SAndroid Build Coastguard Worker add maskq, 32*2 5737*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5738*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 5739*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m5 5740*c0909341SAndroid Build Coastguard Worker paddw m0, m2 5741*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5742*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 5743*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 5744*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5745*c0909341SAndroid Build Coastguard Worker sub hd, 2 5746*c0909341SAndroid Build Coastguard Worker jg .w32 5747*c0909341SAndroid Build Coastguard Worker RET 5748*c0909341SAndroid Build Coastguard Worker 5749*c0909341SAndroid Build Coastguard Workercglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h 5750*c0909341SAndroid Build Coastguard Worker lea r5, [blend_v_avx512icl_table] 5751*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5752*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5753*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 5754*c0909341SAndroid Build Coastguard Worker add wq, r5 5755*c0909341SAndroid Build Coastguard Worker jmp wq 5756*c0909341SAndroid Build Coastguard Worker.w2: 5757*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm2, [obmc_masks_avx2+2*2] 5758*c0909341SAndroid Build Coastguard Worker.w2_loop: 5759*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5760*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [dstq+dsq*1], 1 5761*c0909341SAndroid Build Coastguard Worker movq xmm1, [tmpq] 5762*c0909341SAndroid Build Coastguard Worker add tmpq, 4*2 5763*c0909341SAndroid Build Coastguard Worker psubw xmm1, xmm0, xmm1 5764*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xmm2 5765*c0909341SAndroid Build Coastguard Worker paddw xmm0, xmm1 5766*c0909341SAndroid Build Coastguard 
Worker movd [dstq+dsq*0], xmm0 5767*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 5768*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5769*c0909341SAndroid Build Coastguard Worker sub hd, 2 5770*c0909341SAndroid Build Coastguard Worker jg .w2_loop 5771*c0909341SAndroid Build Coastguard Worker RET 5772*c0909341SAndroid Build Coastguard Worker.w4: 5773*c0909341SAndroid Build Coastguard Worker vpbroadcastq xmm2, [obmc_masks_avx2+4*2] 5774*c0909341SAndroid Build Coastguard Worker.w4_loop: 5775*c0909341SAndroid Build Coastguard Worker movq xmm0, [dstq+dsq*0] 5776*c0909341SAndroid Build Coastguard Worker movhps xmm0, [dstq+dsq*1] 5777*c0909341SAndroid Build Coastguard Worker psubw xmm1, xmm0, [tmpq] 5778*c0909341SAndroid Build Coastguard Worker add tmpq, 8*2 5779*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xmm2 5780*c0909341SAndroid Build Coastguard Worker paddw xmm0, xmm1 5781*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm0 5782*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 5783*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5784*c0909341SAndroid Build Coastguard Worker sub hd, 2 5785*c0909341SAndroid Build Coastguard Worker jg .w4_loop 5786*c0909341SAndroid Build Coastguard Worker RET 5787*c0909341SAndroid Build Coastguard Worker.w8: 5788*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] 5789*c0909341SAndroid Build Coastguard Worker.w8_loop: 5790*c0909341SAndroid Build Coastguard Worker mova xm0, [dstq+dsq*0] 5791*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [dstq+dsq*1], 1 5792*c0909341SAndroid Build Coastguard Worker psubw ym1, ym0, [tmpq] 5793*c0909341SAndroid Build Coastguard Worker add tmpq, 16*2 5794*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym2 5795*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 5796*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5797*c0909341SAndroid 
Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 5798*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5799*c0909341SAndroid Build Coastguard Worker sub hd, 2 5800*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5801*c0909341SAndroid Build Coastguard Worker RET 5802*c0909341SAndroid Build Coastguard Worker.w16: 5803*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] 5804*c0909341SAndroid Build Coastguard Worker.w16_loop: 5805*c0909341SAndroid Build Coastguard Worker mova ym0, [dstq+dsq*0] 5806*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [dstq+dsq*1], 1 5807*c0909341SAndroid Build Coastguard Worker psubw m1, m0, [tmpq] 5808*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 5809*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 5810*c0909341SAndroid Build Coastguard Worker paddw m0, m1 5811*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 5812*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5813*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5814*c0909341SAndroid Build Coastguard Worker sub hd, 2 5815*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5816*c0909341SAndroid Build Coastguard Worker RET 5817*c0909341SAndroid Build Coastguard Worker.w32: 5818*c0909341SAndroid Build Coastguard Worker mova m4, [obmc_masks_avx2+32*2] 5819*c0909341SAndroid Build Coastguard Worker.w32_loop: 5820*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+dsq*0] 5821*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmpq+ 64*0] 5822*c0909341SAndroid Build Coastguard Worker mova m1, [dstq+dsq*1] 5823*c0909341SAndroid Build Coastguard Worker psubw m3, m1, [tmpq+ 64*1] 5824*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5825*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 5826*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 5827*c0909341SAndroid Build Coastguard Worker paddw m0, m2 
5828*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5829*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 5830*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 5831*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5832*c0909341SAndroid Build Coastguard Worker sub hd, 2 5833*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5834*c0909341SAndroid Build Coastguard Worker RET 5835*c0909341SAndroid Build Coastguard Worker 5836*c0909341SAndroid Build Coastguard Workercglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask 5837*c0909341SAndroid Build Coastguard Worker%define base r6-$$ 5838*c0909341SAndroid Build Coastguard Worker lea r6, [$$] 5839*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5840*c0909341SAndroid Build Coastguard Worker mov hd, hm 5841*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+blend_h_avx512icl_table+wq*4] 5842*c0909341SAndroid Build Coastguard Worker lea maskq, [base+obmc_masks_avx2+hq*2] 5843*c0909341SAndroid Build Coastguard Worker lea hd, [hq*3] 5844*c0909341SAndroid Build Coastguard Worker lea wq, [base+blend_h_avx512icl_table+wq] 5845*c0909341SAndroid Build Coastguard Worker shr hd, 2 ; h * 3/4 5846*c0909341SAndroid Build Coastguard Worker lea maskq, [maskq+hq*2] 5847*c0909341SAndroid Build Coastguard Worker neg hq 5848*c0909341SAndroid Build Coastguard Worker jmp wq 5849*c0909341SAndroid Build Coastguard Worker.w2: 5850*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5851*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [dstq+dsq*1], 1 5852*c0909341SAndroid Build Coastguard Worker movd xmm2, [maskq+hq*2] 5853*c0909341SAndroid Build Coastguard Worker movq xmm1, [tmpq] 5854*c0909341SAndroid Build Coastguard Worker add tmpq, 4*2 5855*c0909341SAndroid Build Coastguard Worker punpcklwd xmm2, xmm2 5856*c0909341SAndroid Build Coastguard Worker psubw xmm1, xmm0, xmm1 5857*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xmm2 5858*c0909341SAndroid 
Build Coastguard Worker paddw xmm0, xmm1 5859*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 5860*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 5861*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5862*c0909341SAndroid Build Coastguard Worker add hq, 2 5863*c0909341SAndroid Build Coastguard Worker jl .w2 5864*c0909341SAndroid Build Coastguard Worker RET 5865*c0909341SAndroid Build Coastguard Worker.w4: 5866*c0909341SAndroid Build Coastguard Worker mova xmm3, [blend_shuf] 5867*c0909341SAndroid Build Coastguard Worker.w4_loop: 5868*c0909341SAndroid Build Coastguard Worker movq xmm0, [dstq+dsq*0] 5869*c0909341SAndroid Build Coastguard Worker movhps xmm0, [dstq+dsq*1] 5870*c0909341SAndroid Build Coastguard Worker movd xmm2, [maskq+hq*2] 5871*c0909341SAndroid Build Coastguard Worker psubw xmm1, xmm0, [tmpq] 5872*c0909341SAndroid Build Coastguard Worker add tmpq, 8*2 5873*c0909341SAndroid Build Coastguard Worker pshufb xmm2, xmm3 5874*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xmm2 5875*c0909341SAndroid Build Coastguard Worker paddw xmm0, xmm1 5876*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm0 5877*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 5878*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5879*c0909341SAndroid Build Coastguard Worker add hq, 2 5880*c0909341SAndroid Build Coastguard Worker jl .w4_loop 5881*c0909341SAndroid Build Coastguard Worker RET 5882*c0909341SAndroid Build Coastguard Worker.w8: 5883*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [blend_shuf] 5884*c0909341SAndroid Build Coastguard Worker shufpd ym3, ym3, 0x0c 5885*c0909341SAndroid Build Coastguard Worker.w8_loop: 5886*c0909341SAndroid Build Coastguard Worker mova xm0, [dstq+dsq*0] 5887*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [dstq+dsq*1], 1 5888*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym2, [maskq+hq*2] 
5889*c0909341SAndroid Build Coastguard Worker psubw ym1, ym0, [tmpq] 5890*c0909341SAndroid Build Coastguard Worker add tmpq, 16*2 5891*c0909341SAndroid Build Coastguard Worker pshufb ym2, ym3 5892*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym2 5893*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 5894*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5895*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 5896*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5897*c0909341SAndroid Build Coastguard Worker add hq, 2 5898*c0909341SAndroid Build Coastguard Worker jl .w8_loop 5899*c0909341SAndroid Build Coastguard Worker RET 5900*c0909341SAndroid Build Coastguard Worker.w16: 5901*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [blend_shuf] 5902*c0909341SAndroid Build Coastguard Worker shufpd m3, m3, 0xf0 5903*c0909341SAndroid Build Coastguard Worker.w16_loop: 5904*c0909341SAndroid Build Coastguard Worker mova ym0, [dstq+dsq*0] 5905*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [dstq+dsq*1], 1 5906*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [maskq+hq*2] 5907*c0909341SAndroid Build Coastguard Worker psubw m1, m0, [tmpq] 5908*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 5909*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 5910*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 5911*c0909341SAndroid Build Coastguard Worker paddw m0, m1 5912*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 5913*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5914*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5915*c0909341SAndroid Build Coastguard Worker add hq, 2 5916*c0909341SAndroid Build Coastguard Worker jl .w16_loop 5917*c0909341SAndroid Build Coastguard Worker RET 5918*c0909341SAndroid Build Coastguard Worker.w32: 5919*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, [maskq+hq*2] 
5920*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, [maskq+hq*2+2] 5921*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+dsq*0] 5922*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmpq+ 64*0] 5923*c0909341SAndroid Build Coastguard Worker mova m1, [dstq+dsq*1] 5924*c0909341SAndroid Build Coastguard Worker psubw m3, m1, [tmpq+ 64*1] 5925*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5926*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 5927*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m5 5928*c0909341SAndroid Build Coastguard Worker paddw m0, m2 5929*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5930*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 5931*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 5932*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5933*c0909341SAndroid Build Coastguard Worker add hq, 2 5934*c0909341SAndroid Build Coastguard Worker jl .w32 5935*c0909341SAndroid Build Coastguard Worker RET 5936*c0909341SAndroid Build Coastguard Worker.w64: 5937*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, [maskq+hq*2] 5938*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+64*0] 5939*c0909341SAndroid Build Coastguard Worker psubw m2, m0, [tmpq+64*0] 5940*c0909341SAndroid Build Coastguard Worker mova m1, [dstq+64*1] 5941*c0909341SAndroid Build Coastguard Worker psubw m3, m1, [tmpq+64*1] 5942*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5943*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 5944*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 5945*c0909341SAndroid Build Coastguard Worker paddw m0, m2 5946*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5947*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 5948*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1 5949*c0909341SAndroid Build Coastguard Worker add dstq, dsq 5950*c0909341SAndroid Build Coastguard Worker inc hq 
; Tail of the routine whose header lies above this point: the branch closing
; its .w64 loop, followed by its .w128 loop.
    jl .w64
    RET
; .w128: each iteration handles 4*64 bytes (128 words) at dstq and tmpq.
; With m = the word at [maskq+hq*2], every word is updated as
;     dst += pmulhrsw(dst - tmp, m)        ; ((dst-tmp)*m + 0x4000) >> 15
; hq is incremented each iteration and the loop repeats while it is negative.
.w128:
    vpbroadcastw    m8, [maskq+hq*2]        ; per-iteration weight, all lanes
    mova            m0, [dstq+64*0]
    psubw           m4, m0, [tmpq+64*0]     ; dst - tmp
    mova            m1, [dstq+64*1]
    psubw           m5, m1, [tmpq+64*1]
    mova            m2, [dstq+64*2]
    psubw           m6, m2, [tmpq+64*2]
    mova            m3, [dstq+64*3]
    psubw           m7, m3, [tmpq+64*3]
    add             tmpq, 64*4              ; tmp is consumed contiguously
    REPX {pmulhrsw x, m8}, m4, m5, m6, m7   ; scale the differences
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    mova            [dstq+64*0], m0
    mova            [dstq+64*1], m1
    mova            [dstq+64*2], m2
    mova            [dstq+64*3], m3
    add             dstq, dsq
    inc             hq
    jl .w128
    RET

;-----------------------------------------------------------------------------
; resize_16bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0, pxmax)
;
; Horizontal 8-tap resampling of 16-bit samples. Each inner iteration produces
; 16 output pixels (one per dword lane) and stores them at dst + x*2.
; Source positions are fixed point with 14 fractional bits: the integer part
; (pos >> 14) selects the first source pixel, and (pos >> 8) & 63 selects one
; of 64 eight-byte entries in resize_filter.
;
; Loop-invariant registers:
;   m2      zero
;   m3      pd_16384 (rounding term of the final >> 15)
;   m5      dx * 16  (position step per 16-pixel iteration)
;   m6      (src_w - 8) << 14, upper clamp for the position
;   m7      pd_63    (filter index mask)
;   m8      position of the 16 lanes at x = 0: (mx0 - (4<<14)) + dx*[0-15]
;   m24-27  resize_permA..D, m28/m29 resize_shufA/B, m30 resize_permE
;   ym31    pxmax broadcast to words
;   k6      all-ones mask. Gathers clear the mask register they are given,
;           so k1-k4 are reloaded from k6 in front of every gather.
; Per row: m4 = running position, xd = output x.
;-----------------------------------------------------------------------------
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub             dword mx0m, 4<<14       ; mx0 -= 4 pixels (14-bit fraction)
    sub             dword src_wm, 8         ; src_w -= 8
    mov             r6, ~0
    vpbroadcastd    m5, dxm
    vpbroadcastd    m8, mx0m
    vpbroadcastd    m6, src_wm
    kmovq           k6, r6                  ; k6 = all ones
    ; src_w, dx and mx0 now live in vector registers; their GPR slots are
    ; renamed (r6 becomes the x counter).
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA             r7, $$
%define base r7-$$
    vpbroadcastd    m3, [base+pd_16384]
    vpbroadcastd    m7, [base+pd_63]
    mova            m24, [base+resize_permA]
    mova            m25, [base+resize_permB]
    mova            m26, [base+resize_permC]
    mova            m27, [base+resize_permD]
    vbroadcasti32x4 m28, [base+resize_shufA]
    vbroadcasti32x4 m29, [base+resize_shufB]
    mova            m30, [base+resize_permE]
    vpbroadcastw    ym31, pxmaxm
    vpdpwssd        m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld           m5, 4                   ; dx*16
    pslld           m6, 14                  ; (src_w-8) in position units
    pxor            m2, m2
.next_row:
    xor             xd, xd
    mova            m4, m8                  ; restart positions for this row
.next_16px:
    pmaxsd          m0, m4, m2
    psrad           m9, m4, 8               ; filter index, not yet masked
    pminsd          m0, m6                  ; iclip(mx, 0, src_w-8)
    psubd           m1, m4, m0              ; distance moved by the clamp
    psrad           m0, 14                  ; clamped source x (in pixels)
    psrad           m1, 14                  ; edge-emulation offset (in pixels)
    vptestmd        k5, m1, m1              ; lanes with a non-zero offset
    pand            m9, m7                  ; filter index & 63
    ktestw          k5, k5
    jz .no_edge_emu                         ; no lane was clamped: plain gathers
    ; --- At least one lane was clamped ------------------------------------
    ; Every dword of m0 (source x) and m1 (edge offset) is duplicated and
    ; pd_0_4 is added, giving two qword-gather indices per output pixel.
    ; NOTE(review): pd_0_4 is presumably the dword pair {0, 4}; confirm
    ; against its definition.
    vpbroadcastq    m14, [base+pd_0_4]
    vpermq          m10, m0, q1100
    vpermq          m11, m0, q3322
    vpermq          m20, m1, q1100
    vpermq          m21, m1, q3322
    REPX {punpckldq x, x}, m10, m11, m20, m21
    REPX {paddd x, m14}, m10, m11, m20, m21
    vextracti32x8   ym12, m10, 1
    vextracti32x8   ym13, m11, 1
    vextracti32x8   ym22, m20, 1
    vextracti32x8   ym23, m21, 1
    REPX {kmovq x, k6}, k1, k2, k3, k4
    vpgatherdq      m16{k1}, [srcq+ym10*2]  ; 0 1 2 3
    vpgatherdq      m17{k2}, [srcq+ym11*2]  ; 4 5 6 7
    vpgatherdq      m18{k3}, [srcq+ym12*2]  ; 8 9 A B
    vpgatherdq      m19{k4}, [srcq+ym13*2]  ; C D E F
    REPX {kmovq x, k6}, k1, k2, k3, k4
    ; Byte-shuffle controls for the gathered pixels, fetched from
    ; resize_shuf at a position that depends on each lane's edge offset.
    vpgatherdq      m0{k1}, [base+resize_shuf+8+ym20*2]
    vpgatherdq      m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb          m16, m0
    pshufb          m17, m1
    pshufb          m18, m14
    pshufb          m19, m15
    ; Regroup so that m15..m18 hold pixel pairs a, b, c, d of all 16 outputs,
    ; the same register layout the plain-gather path produces.
    mova            m20, m24
    mova            m22, m24
    mova            m21, m25
    mova            m23, m25
    vpermi2d        m20, m16, m17           ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d        m21, m16, m17           ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d        m22, m18, m19           ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d        m23, m18, m19           ; 8-Bc 8-Bd C-Fc C-Fd
    mova            m15, m26
    mova            m17, m26
    mova            m16, m27
    mova            m18, m27
    vpermi2q        m15, m20, m22           ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q        m16, m20, m22           ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q        m17, m21, m23           ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q        m18, m21, m23           ; 0-3d 4-7d 8-Bd C-Fd
    REPX {kmovq x, k6}, k1, k2
    ; Per-lane filter entry: low and high dword of the 8-byte table row,
    ; rearranged by resize_shufA/B into the four operands used by vpdpwssd.
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
    pshufb          m10, m11, m28
    pshufb          m11, m11, m29
    pshufb          m12, m13, m28
    pshufb          m13, m13, m29
    jmp .convolve
.no_edge_emu:
    ; --- No lane was clamped: fetch coefficients and pixels directly -------
    REPX {kmovq x, k6}, k1, k2, k3, k4
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
    pshufb          m10, m11, m28
    pshufb          m11, m11, m29
    pshufb          m12, m13, m28
    pshufb          m13, m13, m29
    vpgatherdd      m15{k3}, [srcq+m0*2+ 0] ; source pixels x+0, x+1
    vpgatherdd      m16{k4}, [srcq+m0*2+ 4] ; source pixels x+2, x+3
    REPX {kmovq x, k6}, k1, k2
    vpgatherdd      m17{k1}, [srcq+m0*2+ 8] ; source pixels x+4, x+5
    vpgatherdd      m18{k2}, [srcq+m0*2+12] ; source pixels x+6, x+7
.convolve:
    mova            m14, m2                 ; accumulator = 0
    vpdpwssd        m14, m15, m10           ; sum of pixel*coefficient pairs
    vpdpwssd        m14, m16, m11
    vpdpwssd        m14, m17, m12
    vpdpwssd        m14, m18, m13
    psubd           m14, m3, m14            ; 16384 - sum
    psrad           m14, 15
    packusdw        m14, m14                ; to words, negatives saturate to 0
    vpermq          m14, m30, m14           ; collect the 16 results in ym14
    pminsw          ym14, ym31              ; clamp to pxmax
    mova            [dstq+xq*2], ym14
    paddd           m4, m5                  ; advance positions by dx*16
    add             xd, 16
    cmp             xd, dst_wd
    jl .next_16px
    add             dstq, dst_strideq
    add             srcq, src_strideq
    dec             hd
    jg .next_row
    RET

%endif ; ARCH_X86_64