1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, VideoLAN and dav1d authors 2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Two Orioles, LLC 3*c0909341SAndroid Build Coastguard Worker; All rights reserved. 4*c0909341SAndroid Build Coastguard Worker; 5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without 6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met: 7*c0909341SAndroid Build Coastguard Worker; 8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this 9*c0909341SAndroid Build Coastguard Worker; list of conditions and the following disclaimer. 10*c0909341SAndroid Build Coastguard Worker; 11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice, 12*c0909341SAndroid Build Coastguard Worker; this list of conditions and the following disclaimer in the documentation 13*c0909341SAndroid Build Coastguard Worker; and/or other materials provided with the distribution. 14*c0909341SAndroid Build Coastguard Worker; 15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25*c0909341SAndroid Build Coastguard Worker 26*c0909341SAndroid Build Coastguard Worker%include "config.asm" 27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm" 28*c0909341SAndroid Build Coastguard Worker 29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 30*c0909341SAndroid Build Coastguard Worker 31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64 32*c0909341SAndroid Build Coastguard Worker 33*c0909341SAndroid Build Coastguard Workerobmc_masks: 34*c0909341SAndroid Build Coastguard Workerpw_512: times 2 dw 512 35*c0909341SAndroid Build Coastguard Worker ; 2 36*c0909341SAndroid Build Coastguard Worker db 45, 19, 64, 0 37*c0909341SAndroid Build Coastguard Worker ; 4 38*c0909341SAndroid Build Coastguard Worker db 39, 25, 50, 14, 59, 5, 64, 0 39*c0909341SAndroid Build Coastguard Worker ; 8 40*c0909341SAndroid Build Coastguard Worker db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 41*c0909341SAndroid Build Coastguard Worker ; 16 42*c0909341SAndroid Build Coastguard Worker db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 43*c0909341SAndroid Build Coastguard Worker db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 
44*c0909341SAndroid Build Coastguard Worker ; 32 45*c0909341SAndroid Build Coastguard Worker db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 46*c0909341SAndroid Build Coastguard Worker db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 47*c0909341SAndroid Build Coastguard Worker db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 48*c0909341SAndroid Build Coastguard Worker db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 49*c0909341SAndroid Build Coastguard Worker 50*c0909341SAndroid Build Coastguard Workerwarp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 51*c0909341SAndroid Build Coastguard Worker db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 52*c0909341SAndroid Build Coastguard Worker db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 53*c0909341SAndroid Build Coastguard Worker db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 54*c0909341SAndroid Build Coastguard Workerwarp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 55*c0909341SAndroid Build Coastguard Worker db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 56*c0909341SAndroid Build Coastguard Worker db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 57*c0909341SAndroid Build Coastguard Worker db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 58*c0909341SAndroid Build Coastguard Workerwarp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 59*c0909341SAndroid Build Coastguard Workerwarp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 60*c0909341SAndroid Build Coastguard Workerpd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 61*c0909341SAndroid Build Coastguard Workerwarp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 62*c0909341SAndroid Build Coastguard Workerpd_16384: dd 16384 63*c0909341SAndroid Build Coastguard Workerpd_262144: dd 262144 64*c0909341SAndroid Build Coastguard Workerwarp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 
52, 2, 6, 18, 22, 34, 38, 50, 54 65*c0909341SAndroid Build Coastguard Workerwarp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 66*c0909341SAndroid Build Coastguard Worker db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 67*c0909341SAndroid Build Coastguard Workerbidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 68*c0909341SAndroid Build Coastguard Workerwm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 69*c0909341SAndroid Build Coastguard Worker db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 70*c0909341SAndroid Build Coastguard Worker db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 71*c0909341SAndroid Build Coastguard Worker db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 72*c0909341SAndroid Build Coastguard Workerwm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 73*c0909341SAndroid Build Coastguard Worker db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 74*c0909341SAndroid Build Coastguard Worker db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 75*c0909341SAndroid Build Coastguard Worker db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 76*c0909341SAndroid Build Coastguard Workerwm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 77*c0909341SAndroid Build Coastguard Worker db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 78*c0909341SAndroid Build Coastguard Worker db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 79*c0909341SAndroid Build Coastguard Worker db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 80*c0909341SAndroid Build Coastguard Workerwm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 81*c0909341SAndroid Build Coastguard Worker db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 82*c0909341SAndroid Build Coastguard Worker db 1, 5, 9, 13, 
17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 83*c0909341SAndroid Build Coastguard Worker db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 84*c0909341SAndroid Build Coastguard Workerwm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 85*c0909341SAndroid Build Coastguard Worker db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 86*c0909341SAndroid Build Coastguard Worker db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 87*c0909341SAndroid Build Coastguard Worker db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 88*c0909341SAndroid Build Coastguard Workerwm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 89*c0909341SAndroid Build Coastguard Worker db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 90*c0909341SAndroid Build Coastguard Worker db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 91*c0909341SAndroid Build Coastguard Worker db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 92*c0909341SAndroid Build Coastguard Workerbilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 93*c0909341SAndroid Build Coastguard Worker db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 94*c0909341SAndroid Build Coastguard Worker db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 95*c0909341SAndroid Build Coastguard Worker db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 96*c0909341SAndroid Build Coastguard Workerbilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 97*c0909341SAndroid Build Coastguard Worker db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 98*c0909341SAndroid Build Coastguard Worker db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 99*c0909341SAndroid Build Coastguard Worker db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 100*c0909341SAndroid Build Coastguard Workerbilin_v_perm8: db 0, 16, 1, 17, 2, 
18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 101*c0909341SAndroid Build Coastguard Worker db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 102*c0909341SAndroid Build Coastguard Worker db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39 103*c0909341SAndroid Build Coastguard Worker db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71 104*c0909341SAndroid Build Coastguard Workerbilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 105*c0909341SAndroid Build Coastguard Worker db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 106*c0909341SAndroid Build Coastguard Worker db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71 107*c0909341SAndroid Build Coastguard Worker db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79 108*c0909341SAndroid Build Coastguard Workerbilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 109*c0909341SAndroid Build Coastguard Worker db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 110*c0909341SAndroid Build Coastguard Worker db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 111*c0909341SAndroid Build Coastguard Worker db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 112*c0909341SAndroid Build Coastguard Workerbilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11 113*c0909341SAndroid Build Coastguard Workerspel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 114*c0909341SAndroid Build Coastguard Worker db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 115*c0909341SAndroid Build Coastguard Worker db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 116*c0909341SAndroid Build Coastguard Worker db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 117*c0909341SAndroid Build Coastguard Workerspel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 118*c0909341SAndroid Build Coastguard Worker db 8, 9, 10, 11, 9, 10, 
11, 12, 10, 11, 12, 13, 11, 12, 13, 14 119*c0909341SAndroid Build Coastguard Worker db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 120*c0909341SAndroid Build Coastguard Worker db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 121*c0909341SAndroid Build Coastguard Workerspel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 122*c0909341SAndroid Build Coastguard Worker db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23 123*c0909341SAndroid Build Coastguard Worker db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 124*c0909341SAndroid Build Coastguard Worker db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39 125*c0909341SAndroid Build Coastguard Workerspel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 126*c0909341SAndroid Build Coastguard Worker db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 127*c0909341SAndroid Build Coastguard Worker db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 128*c0909341SAndroid Build Coastguard Worker db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 129*c0909341SAndroid Build Coastguard Workerspel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 130*c0909341SAndroid Build Coastguard Worker db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 131*c0909341SAndroid Build Coastguard Worker db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 132*c0909341SAndroid Build Coastguard Worker db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 133*c0909341SAndroid Build Coastguard Workerspel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 134*c0909341SAndroid Build Coastguard Worker db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 135*c0909341SAndroid Build Coastguard Worker db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 136*c0909341SAndroid Build Coastguard Worker db 24, 56, 25, 57, 26, 58, 
27, 59, 28, 60, 29, 61, 30, 62, 31, 63 137*c0909341SAndroid Build Coastguard Workerspel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 138*c0909341SAndroid Build Coastguard Worker db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 139*c0909341SAndroid Build Coastguard Workerspel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 140*c0909341SAndroid Build Coastguard Worker db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 141*c0909341SAndroid Build Coastguard Workerspel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 142*c0909341SAndroid Build Coastguard Worker db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 143*c0909341SAndroid Build Coastguard Workerspel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 144*c0909341SAndroid Build Coastguard Worker db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 145*c0909341SAndroid Build Coastguard Workerspel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 146*c0909341SAndroid Build Coastguard Worker db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 147*c0909341SAndroid Build Coastguard Worker db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 148*c0909341SAndroid Build Coastguard Worker db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 149*c0909341SAndroid Build Coastguard Workerspel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 150*c0909341SAndroid Build Coastguard Worker db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 151*c0909341SAndroid Build Coastguard Worker db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 152*c0909341SAndroid Build Coastguard Worker db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 153*c0909341SAndroid Build Coastguard Workerspel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 
154*c0909341SAndroid Build Coastguard Worker db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 155*c0909341SAndroid Build Coastguard Worker db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 156*c0909341SAndroid Build Coastguard Worker db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 157*c0909341SAndroid Build Coastguard Workerspel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 158*c0909341SAndroid Build Coastguard Worker db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 159*c0909341SAndroid Build Coastguard Worker db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 160*c0909341SAndroid Build Coastguard Worker db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 161*c0909341SAndroid Build Coastguard Workerspel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 162*c0909341SAndroid Build Coastguard Worker db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 163*c0909341SAndroid Build Coastguard Workerspel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 164*c0909341SAndroid Build Coastguard Workerdeint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 165*c0909341SAndroid Build Coastguard Workersubpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 166*c0909341SAndroid Build Coastguard Worker db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 167*c0909341SAndroid Build Coastguard Workersubpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 168*c0909341SAndroid Build Coastguard Workersubpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 169*c0909341SAndroid Build Coastguard Workersubpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 170*c0909341SAndroid Build Coastguard Workerbilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 171*c0909341SAndroid Build Coastguard Workerbilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 
; ---------------------------------------------------------------------------
; Small shuffle/permute index tables and broadcast-sized constants.
;
; Layout matters: wm_sign is also addressed at +4 and +8 through the
; pb_m64 / pb_64 %defines that follow this data, so the order and the size of
; the entries below must not be changed.
; ---------------------------------------------------------------------------
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC:   dd  0,  4,  8, 12
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7

wm_420_perm64:  dq 0xfedcba9876543210
; dword 0: bytes 0x80,0x40,0x80,0x40 / dword 1: 4 x 0xc0 (-64 as a signed byte)
; dword 2: 4 x 0x40 (64)
wm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040

; Each constant below occupies exactly one dword (4 bytes, 2 words or 1 dword),
; e.g. pw_2048 is loaded with vpbroadcastd in put_bilin.
pb_8x0_8x8:     times 8 db 0
                times 8 db 8
pb_4:           times 4 db 4
pb_32:          times 4 db 32
pb_127:         times 4 db 127
; The label below was previously written without the trailing colon; it is
; now declared like every other label in this section.
pw_m128:        times 2 dw -128
pw_m256:        times 2 dw -256
pw_1024:        times 2 dw 1024
pw_2048:        times 2 dw 2048
pw_6903:        times 2 dw 6903
pw_8192:        times 2 dw 8192
pd_32:          dd 32
pd_34:          dd 34
pd_63:          dd 63
pd_512:         dd 512

; Aliases into existing read-only data instead of separate constants:
;   pb_m64 -> second dword of wm_sign (0xc0c0c0c0, i.e. four bytes of -64)
;   pb_64  -> third dword of wm_sign  (0x40404040, i.e. four bytes of  64)
;   pd_2   -> third dword of pd_0to7  (the value 2)
%define pb_m64 (wm_sign+4)
%define pb_64  (wm_sign+8)
%define pd_2   (pd_0to7+8)

cextern mc_subpel_filters
; Address of the external filter table biased by -8 bytes.
; NOTE(review): presumably because filter indices start at 1 with 8-byte
; entries - confirm against the users of subpel_filters.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter

; BASE_JMP_TABLE fn, isa, w0, w1, ...
;   Emits one 16-bit entry per width argument, each holding the distance
;   from the symbol <fn>_<isa> to <fn>_<isa>_w<width>.
;   Defines <fn>_<isa>_table as (table start - w0). With w0 == 2 or 4 this
;   makes [table + tzcnt(w)*2] select the entry for width w, which is how
;   put_bilin indexes it.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; HV_JMP_TABLE fn, filter, isa, types, w0, w1, ...
;   'types' is a bit mask choosing which tables are emitted:
;     bit 0 -> <fn>_<filter>_h_<isa>_table   (entries point at .h_w<width>)
;     bit 1 -> <fn>_<filter>_v_<isa>_table   (entries point at .v_w<width>)
;     bit 2 -> <fn>_<filter>_hv_<isa>_table  (entries point at .hv_w<width>)
;   Entries are 16-bit offsets relative to the symbol <fn>_<isa>.
;   Each %rep rotates the parameter list (%0 - 4) times; the '%rotate 4'
;   that follows completes the full cycle so %5 is the first width again
;   for the next table.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
;   Emits one 32-bit entry per width argument, each holding the distance
;   from <fn>_<isa>_table to the .w<width> label of the mangled function
;   <fn>_8bpc_<isa>. <fn>_<isa>_table itself is (table start - 2*w0).
;   The minimum parameter count is 3 because the body always references %3
;   (the first width); it used to be declared as 2, which did not match.
%macro BIDIR_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Base symbols used by the put/prep tables: the .put / .prep local labels
; inside the bilin functions.
%xdefine put_avx512icl  mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)

; Expands to (<type><fn><SUFFIX>_table - <type><SUFFIX>), i.e. the offset of a
; jump table relative to the matching base symbol.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE  put,        avx512icl,       2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE  prep,       avx512icl,          4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  bilin, avx512icl, 7,   2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, bilin, avx512icl, 7,      4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  6tap,  avx512icl, 2,   2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    put,  8tap,  avx512icl, 3,   2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 6tap,  avx512icl, 2,      4, 8, 16, 32, 64, 128
HV_JMP_TABLE    prep, 8tap,  avx512icl, 3,      4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg,        avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx512icl,          4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx512icl,       2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx512icl,       2, 4, 8, 16, 32, 64, 128

SECTION .text

; WRAP_YMM <line>
;   Assembles the given line under INIT_YMM (same cpuname), then switches
;   back to INIT_ZMM.
%macro WRAP_YMM 1+
INIT_YMM cpuname
    %1
INIT_ZMM cpuname
%endmacro

INIT_ZMM avx512icl
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn          mxyd, r6m ; mx
    lea                  r7, [put_avx512icl] ; r7 = address of the .put label
    tzcnt                wd, wm              ; table index derived from the width
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h                                   ; mx != 0
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v                                   ; mx == 0, my != 0
.put:
    ; mx == 0 and my == 0: dispatch through the 16-bit offset table
    movzx                wd, word [r7+wq*2+table_offset(put,)]
    add                  wq, r7
    jmp                  wq
.put_w2:
    movzx               r6d, word [srcq+ssq*0]
    movzx               r7d, word [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6w
    mov        [dstq+dsq*1], r7w
313*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 314*c0909341SAndroid Build Coastguard Worker sub hd, 2 315*c0909341SAndroid Build Coastguard Worker jg .put_w2 316*c0909341SAndroid Build Coastguard Worker RET 317*c0909341SAndroid Build Coastguard Worker.put_w4: 318*c0909341SAndroid Build Coastguard Worker mov r6d, [srcq+ssq*0] 319*c0909341SAndroid Build Coastguard Worker mov r7d, [srcq+ssq*1] 320*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 321*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6d 322*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7d 323*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 324*c0909341SAndroid Build Coastguard Worker sub hd, 2 325*c0909341SAndroid Build Coastguard Worker jg .put_w4 326*c0909341SAndroid Build Coastguard Worker RET 327*c0909341SAndroid Build Coastguard Worker.put_w8: 328*c0909341SAndroid Build Coastguard Worker mov r6, [srcq+ssq*0] 329*c0909341SAndroid Build Coastguard Worker mov r7, [srcq+ssq*1] 330*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 331*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*0], r6 332*c0909341SAndroid Build Coastguard Worker mov [dstq+dsq*1], r7 333*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 334*c0909341SAndroid Build Coastguard Worker sub hd, 2 335*c0909341SAndroid Build Coastguard Worker jg .put_w8 336*c0909341SAndroid Build Coastguard Worker RET 337*c0909341SAndroid Build Coastguard Worker.put_w16: 338*c0909341SAndroid Build Coastguard Worker movu xmm0, [srcq+ssq*0] 339*c0909341SAndroid Build Coastguard Worker movu xmm1, [srcq+ssq*1] 340*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 341*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xmm0 342*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], xmm1 343*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 344*c0909341SAndroid Build Coastguard Worker sub hd, 2 345*c0909341SAndroid 
Build Coastguard Worker jg .put_w16 346*c0909341SAndroid Build Coastguard Worker RET 347*c0909341SAndroid Build Coastguard Worker.put_w32: 348*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+ssq*0] 349*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+ssq*1] 350*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 351*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 352*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], ym1 353*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 354*c0909341SAndroid Build Coastguard Worker sub hd, 2 355*c0909341SAndroid Build Coastguard Worker jg .put_w32 356*c0909341SAndroid Build Coastguard Worker RET 357*c0909341SAndroid Build Coastguard Worker.put_w64: 358*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0] 359*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*1] 360*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 361*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], m0 362*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1], m1 363*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 364*c0909341SAndroid Build Coastguard Worker sub hd, 2 365*c0909341SAndroid Build Coastguard Worker jg .put_w64 366*c0909341SAndroid Build Coastguard Worker RET 367*c0909341SAndroid Build Coastguard Worker.put_w128: 368*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+ssq*0+64*0] 369*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+ssq*0+64*1] 370*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+ssq*1+64*0] 371*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+ssq*1+64*1] 372*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 373*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+64*0], m0 374*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0+64*1], m1 375*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*1+64*0], m2 376*c0909341SAndroid Build Coastguard Worker 
mova [dstq+dsq*1+64*1], m3 377*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 378*c0909341SAndroid Build Coastguard Worker sub hd, 2 379*c0909341SAndroid Build Coastguard Worker jg .put_w128 380*c0909341SAndroid Build Coastguard Worker RET 381*c0909341SAndroid Build Coastguard Worker.h: 382*c0909341SAndroid Build Coastguard Worker ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 383*c0909341SAndroid Build Coastguard Worker ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 384*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 385*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [bilin_h_perm16] 386*c0909341SAndroid Build Coastguard Worker add mxyd, 16 387*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, mxyd 388*c0909341SAndroid Build Coastguard Worker mov mxyd, r7m ; my 389*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 390*c0909341SAndroid Build Coastguard Worker jnz .hv 391*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] 392*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [pw_2048] 393*c0909341SAndroid Build Coastguard Worker add wq, r7 394*c0909341SAndroid Build Coastguard Worker jmp wq 395*c0909341SAndroid Build Coastguard Worker.h_w2: 396*c0909341SAndroid Build Coastguard Worker movd xmm0, [srcq+ssq*0] 397*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [srcq+ssq*1], 1 398*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 399*c0909341SAndroid Build Coastguard Worker pshufb xmm0, xm4 400*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xm5 401*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm3 402*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm0 403*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm0, 0 404*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm0, 2 405*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 406*c0909341SAndroid Build Coastguard 
; Row-counter update, back-edge and return of .h_w2 (loop body is just above).
    sub             hd, 2
    jg              .h_w2
    RET
; Horizontal filter, two rows per pass, one 32-bit store (4 pixels) per row.
; xmm4 is reloaded with the bilin_h_shuf4 shuffle control before the loop.
.h_w4:
    mova            xmm4, [bilin_h_shuf4]
.h_w4_loop:
    movq            xmm0, [srcq+ssq*0]
    movhps          xmm0, [srcq+ssq*1]      ; second row in the upper 8 bytes
    lea             srcq, [srcq+ssq*2]
    pshufb          xmm0, xmm4
    pmaddubsw       xmm0, xm5
    pmulhrsw        xmm0, xm3
    packuswb        xmm0, xmm0
    movd            [dstq+dsq*0], xmm0
    pextrd          [dstq+dsq*1], xmm0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .h_w4_loop
    RET
; Horizontal filter, two rows per pass, one 64-bit store (8 pixels) per row.
; The RET that ends this path follows this point.
.h_w8:
    movu            xm0, [srcq+ssq*0]
    vinserti32x4    ym0, [srcq+ssq*1], 1    ; second row in the upper 128-bit lane
    lea             srcq, [srcq+ssq*2]
    pshufb          ym0, ym4
    pmaddubsw       ym0, ym5
    pmulhrsw        ym0, ym3
    vpmovuswb       xm0, ym0                ; words -> bytes with unsigned saturation
    movq            [dstq+dsq*0], xm0
    movhps          [dstq+dsq*1], xm0
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .h_w8
; Return of .h_w8 (its loop is just above).
    RET
; Horizontal filter, two rows per pass, one 128-bit store (16 pixels) per row.
; m4 is reloaded in full from bilin_h_perm16 and used as the vpermb index vector.
.h_w16:
    mova            m4, [bilin_h_perm16]
.h_w16_loop:
    movu            ym0, [srcq+ssq*0]
    vinserti32x8    m0, [srcq+ssq*1], 1     ; second row in the upper 256 bits
    lea             srcq, [srcq+ssq*2]
    vpermb          m0, m4, m0
    pmaddubsw       m0, m5
    pmulhrsw        m0, m3
    vpmovuswb       ym0, m0                 ; words -> bytes with unsigned saturation
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*1], ym0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .h_w16_loop
    RET
; Horizontal filter, two rows per pass, one 256-bit store (32 pixels) per row.
; Each row is loaded twice, at byte offsets 0 and 8; row 0 sits in the lower
; 256 bits and row 1 in the upper 256 bits of m0/m1.
; The second-row store and the loop control follow this point.
.h_w32:
    movu            ym0, [srcq+ssq*0+8*0]
    vinserti32x8    m0, [srcq+ssq*1+8*0], 1
    movu            ym1, [srcq+ssq*0+8*1]
    vinserti32x8    m1, [srcq+ssq*1+8*1], 1
    lea             srcq, [srcq+ssq*2]
    REPX            {pshufb x, m4}, m0, m1
    REPX            {pmaddubsw x, m5}, m0, m1
    REPX            {pmulhrsw x, m3}, m0, m1
    packuswb        m0, m1
    mova            [dstq+dsq*0], ym0
; Second-row store and loop control of .h_w32 (loop body is just above).
    vextracti32x8   [dstq+dsq*1], m0, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .h_w32
    RET
; Horizontal filter, one row per pass, one m-register store per row.
; The row is loaded twice, at byte offsets 0 and 8.
.h_w64:
    movu            m0, [srcq+8*0]
    movu            m1, [srcq+8*1]
    REPX            {pshufb x, m4}, m0, m1
    REPX            {pmaddubsw x, m5}, m0, m1
    REPX            {pmulhrsw x, m3}, m0, m1
    packuswb        m0, m1
    add             srcq, ssq
    mova            [dstq], m0
    add             dstq, dsq
    dec             hd
    jg              .h_w64
    RET
; Horizontal filter, one row per pass, two m-register stores per row.
; Loads are taken at byte offsets 0, 8, 64 and 72 of the row.
; The stores and the loop control follow this point.
.h_w128:
    movu            m0, [srcq+8*0]
    movu            m2, [srcq+8*1]
    movu            m1, [srcq+8*8]
    movu            m6, [srcq+8*9]
    add             srcq, ssq
    REPX            {pshufb x, m4}, m0, m2, m1, m6
    REPX            {pmaddubsw x, m5}, m0, m2, m1, m6
    REPX            {pmulhrsw x, m3}, m0, m2, m1, m6
    packuswb        m0, m2
    packuswb        m1, m6
; Stores and loop control of .h_w128 (loop body is just above).
    mova            [dstq+64*0], m0
    mova            [dstq+64*1], m1
    add             dstq, dsq
    dec             hd
    jg              .h_w128
    RET
; Vertical filter: build the weight vector and jump to a width-specific loop
; through the _bilin_v offset table.
; mxy*255 + 16 == (mxy << 8) + (16 - mxy), so each word of m4 carries
; (16 - mxy) in its low byte and mxy in its high byte (assumes 0 < mxy < 16;
; mxy presumably holds my here - the code that loads it is outside this view).
.v:
    movzx           wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    imul            mxyd, 255
    vpbroadcastd    m5, [pw_2048]           ; pmulhrsw factor; presumably 2048 per word, i.e. (x + 8) >> 4
    add             mxyd, 16
    add             wq, r7                  ; 16-bit table entry + base in r7 = handler address
    vpbroadcastw    m4, mxyd
    jmp             wq
; Vertical filter, two rows per pass, one 16-bit store (2 pixels) per row.
; Word 0 of xmm0 carries the newest source row into the next pass.
.v_w2:
    movd            xmm0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw          xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
    lea             srcq, [srcq+ssq*2]
    pinsrw          xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
    pshuflw         xmm1, xmm1, q2301 ; 1 0
    punpcklbw       xmm1, xmm0              ; pair every pixel with the pixel one row below
    pmaddubsw       xmm1, xm4
    pmulhrsw        xmm1, xm5
    packuswb        xmm1, xmm1
    pextrw          [dstq+dsq*0], xmm1, 1   ; word 1 is the first output row,
    pextrw          [dstq+dsq*1], xmm1, 0   ; word 0 the second (swapped by the pshuflw)
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w2_loop
    RET
; Vertical filter, two rows per pass, one 32-bit store (4 pixels) per row.
.v_w4:
    movd            xmm0, [srcq+ssq*0]
.v_w4_loop:
    vpbroadcastd    xmm2, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vpblendd        xmm1, xmm2, xmm0, 0x01 ; 0 1
    vpbroadcastd    xmm0, [srcq+ssq*0]
    vpblendd        xmm2, xmm0, 0x02 ; 1 2
    punpcklbw       xmm1, xmm2
    pmaddubsw       xmm1, xm4
    pmulhrsw        xmm1, xm5
    packuswb        xmm1, xmm1
    movd            [dstq+dsq*0], xmm1
    pextrd          [dstq+dsq*1], xmm1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w4_loop
    RET
; Vertical filter, two rows per pass, one 64-bit store (8 pixels) per row.
; xmm0 holds the newest source row when the loop repeats.
.v_w8:
    movq            xmm0, [srcq+ssq*0]
.v_w8_loop:
    movq            xmm2, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    punpcklbw       xmm1, xmm0, xmm2        ; rows 0/1 interleaved
    movq            xmm0, [srcq+ssq*0]
    punpcklbw       xmm2, xmm0              ; rows 1/2 interleaved
    REPX            {pmaddubsw x, xm4}, xmm1, xmm2
    REPX            {pmulhrsw x, xm5}, xmm1, xmm2
    packuswb        xmm1, xmm2
    movq            [dstq+dsq*0], xmm1
    movhps          [dstq+dsq*1], xmm1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w8_loop
    RET
; Vertical filter, two rows per pass, one 128-bit store (16 pixels) per row.
; This path names xmm/ymm registers explicitly and executes vzeroupper
; before returning.
.v_w16:
    movu            xmm0, [srcq+ssq*0]
.v_w16_loop:
    vbroadcasti128  ymm3, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vpblendd        ymm2, ymm3, ymm0, 0x0f ; 0 1
    vbroadcasti128  ymm0, [srcq+ssq*0]
    vpblendd        ymm3, ymm0, 0xf0 ; 1 2
    punpcklbw       ymm1, ymm2, ymm3
    punpckhbw       ymm2, ymm3
    REPX            {pmaddubsw x, ym4}, ymm1, ymm2
    REPX            {pmulhrsw x, ym5}, ymm1, ymm2
    packuswb        ymm1, ymm2
    mova            [dstq+dsq*0], xmm1
    vextracti128    [dstq+dsq*1], ymm1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w16_loop
    vzeroupper
    RET
; Vertical filter, two rows per pass, one 256-bit store (32 pixels) per row.
; The mask setup and the loop itself follow this point.
.v_w32:
    movu            ym0, [srcq+ssq*0]
; Rest of the .v_w32 setup: k1 = 0xff, a mask covering the low eight dwords
; (the lower 256 bits) for the vpblendmd merges below.
    kxnorb          k1, k1, k1
.v_w32_loop:
    vbroadcasti32x8 m3, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vpblendmd       m2{k1}, m3, m0 ; 0 1
    vbroadcasti32x8 m0, [srcq+ssq*0]
    vpblendmd       m3{k1}, m0, m3 ; 1 2
    punpcklbw       m1, m2, m3
    punpckhbw       m2, m3
    REPX            {pmaddubsw x, m4}, m1, m2
    REPX            {pmulhrsw x, m5}, m1, m2
    packuswb        m1, m2
    mova            [dstq+dsq*0], ym1
    vextracti32x8   [dstq+dsq*1], m1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w32_loop
    RET
; Vertical filter, two rows per pass, one m-register store per row.
; m0 holds the newest source row when the loop repeats.
; The second half of the loop body follows this point.
.v_w64:
    movu            m0, [srcq+ssq*0]
.v_w64_loop:
    movu            m3, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    punpcklbw       m1, m0, m3              ; rows 0/1, low bytes of each lane
    punpckhbw       m6, m0, m3              ; rows 0/1, high bytes of each lane
    movu            m0, [srcq+ssq*0]
    REPX            {pmaddubsw x, m4}, m1, m6
    punpcklbw       m2, m3, m0              ; rows 1/2, low bytes of each lane
; Second half of the .v_w64 loop body (the first half is just above).
    punpckhbw       m3, m0                  ; rows 1/2, high bytes of each lane
    REPX            {pmaddubsw x, m4}, m2, m3
    REPX            {pmulhrsw x, m5}, m1, m6, m2, m3
    packuswb        m1, m6
    packuswb        m2, m3
    mova            [dstq+dsq*0], m1
    mova            [dstq+dsq*1], m2
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .v_w64_loop
    RET
; Vertical filter, one row per pass, two m-register stores per row.
; m0/m1 hold the previous source row, m2/m3 receive the next one.
; The end of the loop body follows this point.
.v_w128:
    movu            m0, [srcq+64*0]
    movu            m1, [srcq+64*1]
.v_w128_loop:
    add             srcq, ssq
    movu            m2, [srcq+64*0]
    movu            m3, [srcq+64*1]
    punpcklbw       m6, m0, m2
    pmaddubsw       m6, m4
    punpckhbw       m0, m2
    pmaddubsw       m0, m4
    punpcklbw       m7, m1, m3
    pmaddubsw       m7, m4
    punpckhbw       m1, m3
    pmaddubsw       m1, m4
    REPX            {pmulhrsw x, m5}, m6, m0, m7, m1
    packuswb        m6, m0
    mova            m0, m2                  ; next row becomes the previous row
    packuswb        m7, m1
; End of the .v_w128 loop body (its start is just above).
    mova            m1, m3                  ; next row becomes the previous row
    mova            [dstq+64*0], m6
    mova            [dstq+64*1], m7
    add             dstq, dsq
    dec             hd
    jg              .v_w128_loop
    RET
; Combined horizontal + vertical filter: set up the vertical multiplier and
; rounding constant, then jump to a width-specific loop through the
; _bilin_hv offset table. m4/m5 are used by the paths below as the shuffle
; control and horizontal weights.
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx           wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl             mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd    m7, [pw_2048]           ; pmulhrsw factor; presumably 2048 per word, i.e. (x + 8) >> 4
    add             wq, r7
    vpbroadcastw    m6, mxyd                ; my << 11 in every word
    jmp             wq
; HV filter, two rows per pass, one 16-bit store (2 pixels) per row.
; xmm0 keeps the horizontally filtered rows of the previous pass.
; The vertical blend and the stores follow this point.
.hv_w2:
    vpbroadcastd    xmm0, [srcq+ssq*0]
    pshufb          xmm0, xm4
    pmaddubsw       xmm0, xm5
.hv_w2_loop:
    movd            xmm1, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    pinsrd          xmm1, [srcq+ssq*0], 1
    pshufb          xmm1, xm4
    pmaddubsw       xmm1, xm5 ; 1 _ 2 _
    shufps          xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
    mova            xmm0, xmm1
; Vertical blend, stores and loop control of .hv_w2 (loop head is just above).
; xmm1 = newer rows, xmm2 = older rows, both already filtered horizontally.
    psubw           xmm1, xmm2
    paddw           xmm1, xmm1              ; doubled so that pmulhw by (my << 11) yields (my * diff) >> 4
    pmulhw          xmm1, xm6
    paddw           xmm1, xmm2
    pmulhrsw        xmm1, xm7
    packuswb        xmm1, xmm1
    pextrw          [dstq+dsq*0], xmm1, 0
    pextrw          [dstq+dsq*1], xmm1, 2
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .hv_w2_loop
    RET
; HV filter, two rows per pass, one 32-bit store (4 pixels) per row.
; xmm4 is reloaded with the bilin_h_shuf4 shuffle control; xmm0 keeps the
; horizontally filtered rows of the previous pass.
; The stores and the loop control follow this point.
.hv_w4:
    mova            xmm4, [bilin_h_shuf4]
    movddup         xmm0, [srcq+ssq*0]
    pshufb          xmm0, xmm4
    pmaddubsw       xmm0, xm5
.hv_w4_loop:
    movq            xmm1, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    movhps          xmm1, [srcq+ssq*0]
    pshufb          xmm1, xmm4
    pmaddubsw       xmm1, xm5 ; 1 2
    shufps          xmm2, xmm0, xmm1, q1032 ; 0 1
    mova            xmm0, xmm1
    psubw           xmm1, xmm2
    paddw           xmm1, xmm1              ; doubled so that pmulhw by (my << 11) yields (my * diff) >> 4
    pmulhw          xmm1, xm6
    paddw           xmm1, xmm2
    pmulhrsw        xmm1, xm7
    packuswb        xmm1, xmm1
; Stores and loop control of .hv_w4 (loop body is just above).
    movd            [dstq+dsq*0], xmm1
    pextrd          [dstq+dsq*1], xmm1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .hv_w4_loop
    RET
; HV filter, two rows per pass, one 64-bit store (8 pixels) per row.
; ym0 keeps the horizontally filtered rows of the previous pass.
.hv_w8:
    vbroadcasti128  ym0, [srcq+ssq*0]
    pshufb          ym0, ym4
    pmaddubsw       ym0, ym5
.hv_w8_loop:
    movu            xm1, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vinserti128     ym1, [srcq+ssq*0], 1
    pshufb          ym1, ym4
    pmaddubsw       ym1, ym5 ; 1 2
    valignq         ym2, ym1, ym0, 2        ; upper half of ym0 followed by lower half of ym1
    mova            ym0, ym1
    psubw           ym1, ym2
    paddw           ym1, ym1                ; doubled so that pmulhw by (my << 11) yields (my * diff) >> 4
    pmulhw          ym1, ym6
    paddw           ym1, ym2
    pmulhrsw        ym1, ym7
    vpmovuswb       xm1, ym1                ; words -> bytes with unsigned saturation
    movq            [dstq+dsq*0], xm1
    movhps          [dstq+dsq*1], xm1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .hv_w8_loop
    RET
; HV filter, two rows per pass, one 128-bit store (16 pixels) per row.
; m4 is reloaded in full from bilin_h_perm16 and used as the vpermb index vector.
.hv_w16:
    vbroadcasti32x8 m0, [srcq+ssq*0]
    mova            m4, [bilin_h_perm16]
    vpermb          m0, m4, m0
    pmaddubsw       m0, m5
.hv_w16_loop:
    movu            ym1, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vinserti32x8    m1, [srcq+ssq*0], 1
    vpermb          m1, m4, m1
    pmaddubsw       m1, m5 ; 1 2
    valignq         m2, m1, m0, 4 ; 0 1
    mova            m0, m1
    psubw           m1, m2
    paddw           m1, m1
    pmulhw          m1, m6
    paddw           m1, m2
    pmulhrsw        m1, m7
    vpmovuswb       ym1, m1
    mova            [dstq+dsq*0], xm1
    vextracti32x4   [dstq+dsq*1], ym1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .hv_w16_loop
    RET
; HV filter, two rows per pass, one 256-bit store (32 pixels) per row.
; m4 is reloaded from bilin_h_perm32; m8 receives the qword indices
; (zero-extended from pb_02461357) for the vpermq applied after the pack.
; m0 keeps the horizontally filtered newest row between passes.
.hv_w32:
    mova            m4, [bilin_h_perm32]
    vpermb          m0, m4, [srcq+ssq*0]
    pmovzxbq        m8, [pb_02461357]
    pmaddubsw       m0, m5
.hv_w32_loop:
    vpermb          m2, m4, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vpermb          m3, m4, [srcq+ssq*0]
    pmaddubsw       m2, m5
    psubw           m1, m2, m0              ; first output row: blend rows 0 and 1
    paddw           m1, m1
    pmulhw          m1, m6
    paddw           m1, m0
    pmaddubsw       m0, m3, m5              ; m0 now holds filtered row 2 for the next pass
    psubw           m3, m0, m2              ; second output row: blend rows 1 and 2
    paddw           m3, m3
    pmulhw          m3, m6
    paddw           m3, m2
    REPX            {pmulhrsw x, m7}, m1, m3
    packuswb        m1, m3
    vpermq          m1, m8, m1
    mova            [dstq+dsq*0], ym1
    vextracti32x8   [dstq+dsq*1], m1, 1
    lea             dstq, [dstq+dsq*2]
    sub             hd, 2
    jg              .hv_w32_loop
    RET
; HV filter, one row per pass, one m-register store per row.
; m0/m1 keep the horizontally filtered previous row (loads at byte offsets 0
; and 8). The rest of the loop body follows this point.
.hv_w64:
    movu            m0, [srcq+8*0]
    movu            m1, [srcq+8*1]
    REPX            {pshufb x, m4}, m0, m1
    REPX            {pmaddubsw x, m5}, m0, m1
.hv_w64_loop:
    add             srcq, ssq
    movu            m2, [srcq+8*0]
    movu            m3, [srcq+8*1]
; Rest of the .hv_w64 loop body (the row loads are just above).
; m2/m3 = current row, m0/m1 = previous row after horizontal filtering.
    REPX            {pshufb x, m4}, m2, m3
    REPX            {pmaddubsw x, m5}, m2, m3
    psubw           m8, m2, m0
    psubw           m9, m3, m1
    paddw           m8, m8                  ; doubled so that pmulhw by (my << 11) yields (my * diff) >> 4
    pmulhw          m8, m6
    paddw           m9, m9
    pmulhw          m9, m6
    paddw           m8, m0
    pmulhrsw        m8, m7
    paddw           m9, m1
    pmulhrsw        m9, m7
    mova            m0, m2                  ; current row becomes the previous row
    mova            m1, m3
    packuswb        m8, m9
    mova            [dstq], m8
    add             dstq, dsq
    dec             hd
    jg              .hv_w64_loop
    RET
; HV filter, one row per pass, two m-register stores per row.
; m0-m3 keep the horizontally filtered previous row (loads at byte offsets
; 0, 8, 64 and 72). The rest of the loop body follows this point.
.hv_w128:
    movu            m0, [srcq+8*0]
    movu            m1, [srcq+8*1]
    movu            m2, [srcq+8*8]
    movu            m3, [srcq+8*9]
    REPX            {pshufb x, m4}, m0, m1, m2, m3
    REPX            {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
    add             srcq, ssq
    movu            m8, [srcq+8*0]
; Continuation of the .hv_w128 loop body (the first load is just above).
; m8-m11 = current row, m0-m3 = previous row, m12-m15 = blended output.
    movu            m9, [srcq+8*1]
    movu            m10, [srcq+8*8]
    movu            m11, [srcq+8*9]
    REPX            {pshufb x, m4}, m8, m9, m10, m11
    REPX            {pmaddubsw x, m5}, m8, m9, m10, m11
    psubw           m12, m8, m0
    psubw           m13, m9, m1
    psubw           m14, m10, m2
    psubw           m15, m11, m3
    paddw           m12, m12                ; doubled so that pmulhw by (my << 11) yields (my * diff) >> 4
    pmulhw          m12, m6
    paddw           m13, m13
    pmulhw          m13, m6
    paddw           m14, m14
    pmulhw          m14, m6
    paddw           m15, m15
    pmulhw          m15, m6
    paddw           m12, m0
    pmulhrsw        m12, m7
    paddw           m13, m1
    pmulhrsw        m13, m7
    paddw           m14, m2
    pmulhrsw        m14, m7
    paddw           m15, m3
    pmulhrsw        m15, m7
    mova            m0, m8                  ; current row becomes the previous row
    mova            m1, m9
    mova            m2, m10
    mova            m3, m11
    packuswb        m12, m13
    packuswb        m14, m15
    mova            [dstq+64*0], m12
; Second store and loop control of .hv_w128 (loop body is just above).
    mova            [dstq+64*1], m14
    add             dstq, dsq
    dec             hd
    jg              .hv_w128_loop
    RET

DECLARE_REG_TMP 3, 5, 6

; prep_bilin_8bpc(tmp, src, stride, w, h, mxy, stride3)
; Entry point: picks a code path from the two filter fractions.
;   mx != 0            -> .h
;   mx == 0, my != 0   -> .v
;   both zero          -> .prep: bytes are zero-extended to words, shifted
;                         left by 4 and written contiguously to tmp.
; w is replaced by tzcnt(w) and then used to index a table of 16-bit offsets
; relative to prep_avx512icl (held in t2).
; NOTE(review): with DECLARE_REG_TMP 3, 5, 6, t2 looks like the same register
; as stride3, which would explain why stride3q is only computed after the
; table lookup - confirm against x86inc.
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn       mxyd, r5m ; mx
    lea             t2, [prep_avx512icl]
    tzcnt           wd, wm
    movifnidn       hd, hm
    test            mxyd, mxyd
    jnz             .h
    mov             mxyd, r6m ; my
    test            mxyd, mxyd
    jnz             .v
.prep:
    movzx           wd, word [t2+wq*2+table_offset(prep,)]
    add             wq, t2
    lea             stride3q, [strideq*3]
    jmp             wq
; Unfiltered prep, four rows of 4 pixels per pass (one dword per row).
; The store and the loop control follow this point.
.prep_w4:
    movd            xmm0, [srcq+strideq*0]
    pinsrd          xmm0, [srcq+strideq*1], 1
    pinsrd          xmm0, [srcq+strideq*2], 2
    pinsrd          xmm0, [srcq+stride3q], 3
    lea             srcq, [srcq+strideq*4]
    pmovzxbw        ym0, xmm0               ; 16 bytes -> 16 words
    psllw           ym0, 4
; Store and loop control of .prep_w4 (loop body is just above).
    mova            [tmpq], ym0
    add             tmpq, 32
    sub             hd, 4
    jg              .prep_w4
    RET
; Unfiltered prep, four rows of 8 pixels per pass.
; Rows 0/1 go to the low lanes of ym0/ym1 and rows 2/3 to the high lanes;
; punpcklqdq then leaves the low 8 bytes of rows 0, 1, 2, 3 in order.
.prep_w8:
    movq            xmm0, [srcq+strideq*0]
    movq            xmm1, [srcq+strideq*1]
    vinserti128     ym0, ymm0, [srcq+strideq*2], 1
    vinserti128     ym1, ymm1, [srcq+stride3q], 1
    lea             srcq, [srcq+strideq*4]
    punpcklqdq      ym0, ym1
    pmovzxbw        m0, ym0                 ; 32 bytes -> 32 words
    psllw           m0, 4
    mova            [tmpq], m0
    add             tmpq, 32*2
    sub             hd, 4
    jg              .prep_w8
    RET
; Unfiltered prep, four rows of 16 pixels per pass (two rows per register).
; The tmp pointer advance and the loop control follow this point.
.prep_w16:
    movu            xmm0, [srcq+strideq*0]
    vinserti128     ym0, ymm0, [srcq+strideq*1], 1
    movu            xmm1, [srcq+strideq*2]
    vinserti128     ym1, ymm1, [srcq+stride3q], 1
    lea             srcq, [srcq+strideq*4]
    pmovzxbw        m0, ym0
    pmovzxbw        m1, ym1
    REPX            {psllw x, 4}, m0, m1
    mova            [tmpq+64*0], m0
    mova            [tmpq+64*1], m1
Build Coastguard Worker add tmpq, 32*4 937*c0909341SAndroid Build Coastguard Worker sub hd, 4 938*c0909341SAndroid Build Coastguard Worker jg .prep_w16 939*c0909341SAndroid Build Coastguard Worker RET 940*c0909341SAndroid Build Coastguard Worker.prep_w32: 941*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+strideq*0] 942*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+strideq*1] 943*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+strideq*2] 944*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+stride3q ] 945*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 946*c0909341SAndroid Build Coastguard Worker REPX {psllw x, 4}, m0, m1, m2, m3 947*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 948*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 949*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 950*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 951*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 952*c0909341SAndroid Build Coastguard Worker sub hd, 4 953*c0909341SAndroid Build Coastguard Worker jg .prep_w32 954*c0909341SAndroid Build Coastguard Worker RET 955*c0909341SAndroid Build Coastguard Worker.prep_w64: 956*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+strideq*0+32*0] 957*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+strideq*0+32*1] 958*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+strideq*1+32*0] 959*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+strideq*1+32*1] 960*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 961*c0909341SAndroid Build Coastguard Worker REPX {psllw x, 4}, m0, m1, m2, m3 962*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 963*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 964*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 965*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 
966*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 967*c0909341SAndroid Build Coastguard Worker sub hd, 2 968*c0909341SAndroid Build Coastguard Worker jg .prep_w64 969*c0909341SAndroid Build Coastguard Worker RET 970*c0909341SAndroid Build Coastguard Worker.prep_w128: 971*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [srcq+32*0] 972*c0909341SAndroid Build Coastguard Worker pmovzxbw m1, [srcq+32*1] 973*c0909341SAndroid Build Coastguard Worker pmovzxbw m2, [srcq+32*2] 974*c0909341SAndroid Build Coastguard Worker pmovzxbw m3, [srcq+32*3] 975*c0909341SAndroid Build Coastguard Worker REPX {psllw x, 4}, m0, m1, m2, m3 976*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 977*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 978*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 979*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 980*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 981*c0909341SAndroid Build Coastguard Worker add srcq, strideq 982*c0909341SAndroid Build Coastguard Worker dec hd 983*c0909341SAndroid Build Coastguard Worker jg .prep_w128 984*c0909341SAndroid Build Coastguard Worker RET 985*c0909341SAndroid Build Coastguard Worker.h: 986*c0909341SAndroid Build Coastguard Worker ; 16 * src[x] + (mx * (src[x + 1] - src[x])) 987*c0909341SAndroid Build Coastguard Worker ; = (16 - mx) * src[x] + mx * src[x + 1] 988*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 989*c0909341SAndroid Build Coastguard Worker add mxyd, 16 990*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, mxyd 991*c0909341SAndroid Build Coastguard Worker mov mxyd, r6m ; my 992*c0909341SAndroid Build Coastguard Worker test mxyd, mxyd 993*c0909341SAndroid Build Coastguard Worker jnz .hv 994*c0909341SAndroid Build Coastguard Worker movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] 995*c0909341SAndroid Build Coastguard Worker add wq, t2 996*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 
997*c0909341SAndroid Build Coastguard Worker jmp wq 998*c0909341SAndroid Build Coastguard Worker.h_w4: 999*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [bilin_h_shuf4] 1000*c0909341SAndroid Build Coastguard Worker.h_w4_loop: 1001*c0909341SAndroid Build Coastguard Worker movq xmm0, [srcq+strideq*0] 1002*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+strideq*1] 1003*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 1004*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 1005*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1006*c0909341SAndroid Build Coastguard Worker punpcklqdq ym0, ym1 1007*c0909341SAndroid Build Coastguard Worker pshufb ym0, ym4 1008*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym5 1009*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym0 1010*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1011*c0909341SAndroid Build Coastguard Worker sub hd, 4 1012*c0909341SAndroid Build Coastguard Worker jg .h_w4_loop 1013*c0909341SAndroid Build Coastguard Worker RET 1014*c0909341SAndroid Build Coastguard Worker.h_w8: 1015*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [bilin_h_perm16] 1016*c0909341SAndroid Build Coastguard Worker.h_w8_loop: 1017*c0909341SAndroid Build Coastguard Worker movu xmm0, [srcq+strideq*0] 1018*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 1019*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+strideq*2], 2 1020*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [srcq+stride3q ], 3 1021*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1022*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1023*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1024*c0909341SAndroid Build Coastguard Worker mova [tmpq], m0 1025*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1026*c0909341SAndroid Build 
Coastguard Worker sub hd, 4 1027*c0909341SAndroid Build Coastguard Worker jg .h_w8_loop 1028*c0909341SAndroid Build Coastguard Worker RET 1029*c0909341SAndroid Build Coastguard Worker.h_w16: 1030*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm16] 1031*c0909341SAndroid Build Coastguard Worker.h_w16_loop: 1032*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 1033*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+strideq*1], 1 1034*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+strideq*2] 1035*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [srcq+stride3q ], 1 1036*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1037*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, m0 1038*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, m1 1039*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1040*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1041*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1042*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1043*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1044*c0909341SAndroid Build Coastguard Worker sub hd, 4 1045*c0909341SAndroid Build Coastguard Worker jg .h_w16_loop 1046*c0909341SAndroid Build Coastguard Worker RET 1047*c0909341SAndroid Build Coastguard Worker.h_w32: 1048*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1049*c0909341SAndroid Build Coastguard Worker.h_w32_loop: 1050*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+strideq*0] 1051*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+strideq*1] 1052*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+strideq*2] 1053*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, [srcq+stride3q ] 1054*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1055*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1056*c0909341SAndroid Build Coastguard Worker 
pmaddubsw m1, m5 1057*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1058*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1059*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1060*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1061*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 1062*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 1063*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1064*c0909341SAndroid Build Coastguard Worker sub hd, 4 1065*c0909341SAndroid Build Coastguard Worker jg .h_w32_loop 1066*c0909341SAndroid Build Coastguard Worker RET 1067*c0909341SAndroid Build Coastguard Worker.h_w64: 1068*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1069*c0909341SAndroid Build Coastguard Worker.h_w64_loop: 1070*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+strideq*0+32*0] 1071*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+strideq*0+32*1] 1072*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+strideq*1+32*0] 1073*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, [srcq+strideq*1+32*1] 1074*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1075*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1076*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1077*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1078*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1079*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1080*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1081*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 1082*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 1083*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1084*c0909341SAndroid Build Coastguard Worker sub hd, 2 1085*c0909341SAndroid Build Coastguard Worker jg .h_w64_loop 1086*c0909341SAndroid Build Coastguard Worker RET 1087*c0909341SAndroid 
Build Coastguard Worker.h_w128: 1088*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1089*c0909341SAndroid Build Coastguard Worker.h_w128_loop: 1090*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+32*0] 1091*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+32*1] 1092*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+32*2] 1093*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, [srcq+32*3] 1094*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1095*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1096*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1097*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1098*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m0 1099*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m1 1100*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m2 1101*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m3 1102*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1103*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1104*c0909341SAndroid Build Coastguard Worker dec hd 1105*c0909341SAndroid Build Coastguard Worker jg .h_w128_loop 1106*c0909341SAndroid Build Coastguard Worker RET 1107*c0909341SAndroid Build Coastguard Worker.v: 1108*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 1109*c0909341SAndroid Build Coastguard Worker movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] 1110*c0909341SAndroid Build Coastguard Worker imul mxyd, 255 1111*c0909341SAndroid Build Coastguard Worker add mxyd, 16 1112*c0909341SAndroid Build Coastguard Worker add wq, t2 1113*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1114*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, mxyd 1115*c0909341SAndroid Build Coastguard Worker jmp wq 1116*c0909341SAndroid Build Coastguard Worker.v_w4: 1117*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [srcq+strideq*0] 
1118*c0909341SAndroid Build Coastguard Worker mov r3d, 0x29 1119*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [bilin_v_shuf4] 1120*c0909341SAndroid Build Coastguard Worker kmovb k1, r3d 1121*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1122*c0909341SAndroid Build Coastguard Worker vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ 1123*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym2, [srcq+strideq*2] 1124*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ 1125*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1126*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym0, [srcq+strideq*0] 1127*c0909341SAndroid Build Coastguard Worker punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ 1128*c0909341SAndroid Build Coastguard Worker pshufb ym2, ym3 1129*c0909341SAndroid Build Coastguard Worker pmaddubsw ym2, ym6 1130*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym2 1131*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1132*c0909341SAndroid Build Coastguard Worker sub hd, 4 1133*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1134*c0909341SAndroid Build Coastguard Worker RET 1135*c0909341SAndroid Build Coastguard Worker.v_w8: 1136*c0909341SAndroid Build Coastguard Worker mova m5, [bilin_v_perm8] 1137*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [srcq+strideq*0] 1138*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1139*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1140*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym0, [srcq+strideq*2] 1141*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+stride3q ], 2 1142*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1143*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym0, [srcq+strideq*0], 0 1144*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m5, m0 1145*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, 
m6 1146*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1147*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1148*c0909341SAndroid Build Coastguard Worker sub hd, 4 1149*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1150*c0909341SAndroid Build Coastguard Worker RET 1151*c0909341SAndroid Build Coastguard Worker.v_w16: 1152*c0909341SAndroid Build Coastguard Worker mova m5, [bilin_v_perm16] 1153*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 1154*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1155*c0909341SAndroid Build Coastguard Worker movu xm2, [srcq+strideq*2] 1156*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 1157*c0909341SAndroid Build Coastguard Worker vpermt2b m1, m5, m2 1158*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+stride3q ], 1 1159*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1160*c0909341SAndroid Build Coastguard Worker movu xm0, [srcq+strideq*0] 1161*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m5, m0 1162*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1163*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1164*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1165*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1166*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1167*c0909341SAndroid Build Coastguard Worker sub hd, 4 1168*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1169*c0909341SAndroid Build Coastguard Worker RET 1170*c0909341SAndroid Build Coastguard Worker.v_w32: 1171*c0909341SAndroid Build Coastguard Worker mova m5, [bilin_v_perm32] 1172*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 1173*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1174*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+strideq*1] 1175*c0909341SAndroid Build Coastguard Worker movu ym3, [srcq+strideq*2] 1176*c0909341SAndroid Build Coastguard 
Worker movu ym4, [srcq+stride3q ] 1177*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1178*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m5, m2 1179*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m5, m3 1180*c0909341SAndroid Build Coastguard Worker vpermt2b m3, m5, m4 1181*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m0, m6 1182*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+strideq*0] 1183*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m5, m0 1184*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1185*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1186*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1187*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1188*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1189*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m3 1190*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m4 1191*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1192*c0909341SAndroid Build Coastguard Worker sub hd, 4 1193*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1194*c0909341SAndroid Build Coastguard Worker RET 1195*c0909341SAndroid Build Coastguard Worker.v_w64: 1196*c0909341SAndroid Build Coastguard Worker mova m5, [bilin_v_perm64] 1197*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, [srcq+strideq*0] 1198*c0909341SAndroid Build Coastguard Worker.v_w64_loop: 1199*c0909341SAndroid Build Coastguard Worker vpermq m1, m5, [srcq+strideq*1] 1200*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1201*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0, m1 1202*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m1 1203*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, [srcq+strideq*0] 1204*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m1, m0 1205*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0 1206*c0909341SAndroid Build Coastguard Worker 
pmaddubsw m4, m6 1207*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1208*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1209*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1210*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m4 1211*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1212*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m3 1213*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m1 1214*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1215*c0909341SAndroid Build Coastguard Worker sub hd, 2 1216*c0909341SAndroid Build Coastguard Worker jg .v_w64_loop 1217*c0909341SAndroid Build Coastguard Worker RET 1218*c0909341SAndroid Build Coastguard Worker.v_w128: 1219*c0909341SAndroid Build Coastguard Worker mova m5, [bilin_v_perm64] 1220*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, [srcq+strideq*0+ 0] 1221*c0909341SAndroid Build Coastguard Worker vpermq m1, m5, [srcq+strideq*0+64] 1222*c0909341SAndroid Build Coastguard Worker.v_w128_loop: 1223*c0909341SAndroid Build Coastguard Worker vpermq m2, m5, [srcq+strideq*1+ 0] 1224*c0909341SAndroid Build Coastguard Worker vpermq m3, m5, [srcq+strideq*1+64] 1225*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1226*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0, m2 1227*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2 1228*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1229*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m6 1230*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m4 1231*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m0 1232*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m1, m3 1233*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m3 1234*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1235*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6 1236*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m4 
1237*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m1 1238*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, [srcq+strideq*0+ 0] 1239*c0909341SAndroid Build Coastguard Worker vpermq m1, m5, [srcq+strideq*0+64] 1240*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2, m0 1241*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0 1242*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1243*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m6 1244*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*4], m4 1245*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*5], m2 1246*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3, m1 1247*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1 1248*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m6 1249*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m6 1250*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*6], m4 1251*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*7], m3 1252*c0909341SAndroid Build Coastguard Worker add tmpq, 64*8 1253*c0909341SAndroid Build Coastguard Worker sub hd, 2 1254*c0909341SAndroid Build Coastguard Worker jg .v_w128_loop 1255*c0909341SAndroid Build Coastguard Worker RET 1256*c0909341SAndroid Build Coastguard Worker.hv: 1257*c0909341SAndroid Build Coastguard Worker ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 1258*c0909341SAndroid Build Coastguard Worker ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) 1259*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 7 1260*c0909341SAndroid Build Coastguard Worker movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] 1261*c0909341SAndroid Build Coastguard Worker shl mxyd, 11 1262*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, mxyd 1263*c0909341SAndroid Build Coastguard Worker add wq, t2 1264*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 1265*c0909341SAndroid Build Coastguard Worker jmp wq 
1266*c0909341SAndroid Build Coastguard Worker.hv_w4: 1267*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [bilin_h_shuf4] 1268*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym0, [srcq+strideq*0] 1269*c0909341SAndroid Build Coastguard Worker pshufb ym0, ym4 1270*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym5 1271*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1272*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+strideq*1] 1273*c0909341SAndroid Build Coastguard Worker movq xmm2, [srcq+strideq*2] 1274*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 1275*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1276*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 1277*c0909341SAndroid Build Coastguard Worker punpcklqdq ym1, ym2 1278*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym4 1279*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym5 ; 1 2 3 4 1280*c0909341SAndroid Build Coastguard Worker valignq ym2, ym1, ym0, 3 ; 0 1 2 3 1281*c0909341SAndroid Build Coastguard Worker mova ym0, ym1 1282*c0909341SAndroid Build Coastguard Worker psubw ym1, ym2 1283*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym6 1284*c0909341SAndroid Build Coastguard Worker paddw ym1, ym2 1285*c0909341SAndroid Build Coastguard Worker mova [tmpq], ym1 1286*c0909341SAndroid Build Coastguard Worker add tmpq, 32 1287*c0909341SAndroid Build Coastguard Worker sub hd, 4 1288*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1289*c0909341SAndroid Build Coastguard Worker RET 1290*c0909341SAndroid Build Coastguard Worker.hv_w8: 1291*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [bilin_h_perm16] 1292*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+strideq*0] 1293*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1294*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1295*c0909341SAndroid Build 
Coastguard Worker.hv_w8_loop: 1296*c0909341SAndroid Build Coastguard Worker movu xmm1, [srcq+strideq*1] 1297*c0909341SAndroid Build Coastguard Worker vinserti128 ym1, ymm1, [srcq+strideq*2], 1 1298*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+stride3q ], 2 1299*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1300*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq*0], 3 1301*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1302*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 ; 1 2 3 4 1303*c0909341SAndroid Build Coastguard Worker valignq m2, m1, m0, 6 ; 0 1 2 3 1304*c0909341SAndroid Build Coastguard Worker mova m0, m1 1305*c0909341SAndroid Build Coastguard Worker psubw m1, m2 1306*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1307*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1308*c0909341SAndroid Build Coastguard Worker mova [tmpq], m1 1309*c0909341SAndroid Build Coastguard Worker add tmpq, 64 1310*c0909341SAndroid Build Coastguard Worker sub hd, 4 1311*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 1312*c0909341SAndroid Build Coastguard Worker RET 1313*c0909341SAndroid Build Coastguard Worker.hv_w16: 1314*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm16] 1315*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m0, [srcq+strideq*0] 1316*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, m0 1317*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1318*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 1319*c0909341SAndroid Build Coastguard Worker movu ym1, [srcq+strideq*1] 1320*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [srcq+strideq*2], 1 1321*c0909341SAndroid Build Coastguard Worker movu ym2, [srcq+stride3q ] 1322*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 1323*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, [srcq+strideq*0], 1 1324*c0909341SAndroid Build Coastguard Worker vpermb 
m1, m4, m1 1325*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, m2 1326*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 ; 1 2 1327*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m0, m1, q1032 ; 0 1 1328*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 ; 3 4 1329*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m1, m0, q1032 ; 2 3 1330*c0909341SAndroid Build Coastguard Worker psubw m1, m3 1331*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6 1332*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1333*c0909341SAndroid Build Coastguard Worker psubw m3, m0, m2 1334*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1335*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1336*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m1 1337*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m3 1338*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1339*c0909341SAndroid Build Coastguard Worker sub hd, 4 1340*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 1341*c0909341SAndroid Build Coastguard Worker RET 1342*c0909341SAndroid Build Coastguard Worker.hv_w32: 1343*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1344*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+strideq*0] 1345*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1346*c0909341SAndroid Build Coastguard Worker.hv_w32_loop: 1347*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+strideq*1] 1348*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1349*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+strideq*0] 1350*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1351*c0909341SAndroid Build Coastguard Worker psubw m3, m1, m0 1352*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1353*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1354*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 1355*c0909341SAndroid 
Build Coastguard Worker psubw m2, m0, m1 1356*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 1357*c0909341SAndroid Build Coastguard Worker paddw m2, m1 1358*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m3 1359*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m2 1360*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 1361*c0909341SAndroid Build Coastguard Worker sub hd, 2 1362*c0909341SAndroid Build Coastguard Worker jg .hv_w32_loop 1363*c0909341SAndroid Build Coastguard Worker RET 1364*c0909341SAndroid Build Coastguard Worker.hv_w64: 1365*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1366*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+32*0] 1367*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+32*1] 1368*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m5 1369*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m5 1370*c0909341SAndroid Build Coastguard Worker.hv_w64_loop: 1371*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1372*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+32*0] 1373*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, [srcq+32*1] 1374*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m5 1375*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1376*c0909341SAndroid Build Coastguard Worker psubw m7, m2, m0 1377*c0909341SAndroid Build Coastguard Worker psubw m8, m3, m1 1378*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m6 1379*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m6 1380*c0909341SAndroid Build Coastguard Worker paddw m7, m0 1381*c0909341SAndroid Build Coastguard Worker mova m0, m2 1382*c0909341SAndroid Build Coastguard Worker paddw m8, m1 1383*c0909341SAndroid Build Coastguard Worker mova m1, m3 1384*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m7 1385*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m8 1386*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 
1387*c0909341SAndroid Build Coastguard Worker dec hd 1388*c0909341SAndroid Build Coastguard Worker jg .hv_w64_loop 1389*c0909341SAndroid Build Coastguard Worker RET 1390*c0909341SAndroid Build Coastguard Worker.hv_w128: 1391*c0909341SAndroid Build Coastguard Worker mova m4, [bilin_h_perm32] 1392*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, [srcq+32*0] 1393*c0909341SAndroid Build Coastguard Worker vpermb m1, m4, [srcq+32*1] 1394*c0909341SAndroid Build Coastguard Worker vpermb m2, m4, [srcq+32*2] 1395*c0909341SAndroid Build Coastguard Worker vpermb m3, m4, [srcq+32*3] 1396*c0909341SAndroid Build Coastguard Worker REPX {pmaddubsw x, m5}, m0, m1, m2, m3 1397*c0909341SAndroid Build Coastguard Worker.hv_w128_loop: 1398*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1399*c0909341SAndroid Build Coastguard Worker vpermb m7, m4, [srcq+32*0] 1400*c0909341SAndroid Build Coastguard Worker vpermb m8, m4, [srcq+32*1] 1401*c0909341SAndroid Build Coastguard Worker vpermb m9, m4, [srcq+32*2] 1402*c0909341SAndroid Build Coastguard Worker vpermb m10, m4, [srcq+32*3] 1403*c0909341SAndroid Build Coastguard Worker REPX {pmaddubsw x, m5}, m7, m8, m9, m10 1404*c0909341SAndroid Build Coastguard Worker psubw m11, m7, m0 1405*c0909341SAndroid Build Coastguard Worker psubw m12, m8, m1 1406*c0909341SAndroid Build Coastguard Worker psubw m13, m9, m2 1407*c0909341SAndroid Build Coastguard Worker psubw m14, m10, m3 1408*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m6}, m11, m12, m13, m14 1409*c0909341SAndroid Build Coastguard Worker paddw m11, m0 1410*c0909341SAndroid Build Coastguard Worker mova m0, m7 1411*c0909341SAndroid Build Coastguard Worker paddw m12, m1 1412*c0909341SAndroid Build Coastguard Worker mova m1, m8 1413*c0909341SAndroid Build Coastguard Worker paddw m13, m2 1414*c0909341SAndroid Build Coastguard Worker mova m2, m9 1415*c0909341SAndroid Build Coastguard Worker paddw m14, m3 1416*c0909341SAndroid Build Coastguard Worker mova m3, m10 
1417*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*0], m11 1418*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*1], m12 1419*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*2], m13 1420*c0909341SAndroid Build Coastguard Worker mova [tmpq+64*3], m14 1421*c0909341SAndroid Build Coastguard Worker add tmpq, 64*4 1422*c0909341SAndroid Build Coastguard Worker dec hd 1423*c0909341SAndroid Build Coastguard Worker jg .hv_w128_loop 1424*c0909341SAndroid Build Coastguard Worker RET 1425*c0909341SAndroid Build Coastguard Worker 1426*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8] 1427*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15 1428*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH (1*15 << 16) | 4*15 1429*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP (2*15 << 16) | 3*15 1430*c0909341SAndroid Build Coastguard Worker 1431*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to 1432*c0909341SAndroid Build Coastguard Workercglobal %1_%2_8bpc 1433*c0909341SAndroid Build Coastguard Worker mov t0d, FILTER_%3 1434*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4 1435*c0909341SAndroid Build Coastguard Worker mov t1d, t0d 1436*c0909341SAndroid Build Coastguard Worker%else 1437*c0909341SAndroid Build Coastguard Worker mov t1d, FILTER_%4 1438*c0909341SAndroid Build Coastguard Worker%endif 1439*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter 1440*c0909341SAndroid Build Coastguard Worker jmp mangle(private_prefix %+ _%5 %+ SUFFIX) 1441*c0909341SAndroid Build Coastguard Worker%endif 1442*c0909341SAndroid Build Coastguard Worker%endmacro 1443*c0909341SAndroid Build Coastguard Worker 1444*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb 1445*c0909341SAndroid Build Coastguard Worker%if %5 1446*c0909341SAndroid Build Coastguard Worker vpermb m%2, m6, m%1 
1447*c0909341SAndroid Build Coastguard Worker vpermb m%3, m7, m%1 1448*c0909341SAndroid Build Coastguard Worker vpermb m%4, m8, m%1 1449*c0909341SAndroid Build Coastguard Worker%else 1450*c0909341SAndroid Build Coastguard Worker%if %2 < %4 ; reuse a previous value if possible 1451*c0909341SAndroid Build Coastguard Worker pshufb m%2, m%1, m6 1452*c0909341SAndroid Build Coastguard Worker%endif 1453*c0909341SAndroid Build Coastguard Worker pshufb m%3, m%1, m7 1454*c0909341SAndroid Build Coastguard Worker pshufb m%4, m%1, m8 1455*c0909341SAndroid Build Coastguard Worker%endif 1456*c0909341SAndroid Build Coastguard Worker mova m%1, m5 1457*c0909341SAndroid Build Coastguard Worker vpdpbusd m%1, m%2, m9 1458*c0909341SAndroid Build Coastguard Worker mova m%2, m5 1459*c0909341SAndroid Build Coastguard Worker vpdpbusd m%2, m%3, m9 1460*c0909341SAndroid Build Coastguard Worker vpdpbusd m%1, m%3, m10 1461*c0909341SAndroid Build Coastguard Worker vpdpbusd m%2, m%4, m10 1462*c0909341SAndroid Build Coastguard Worker packusdw m%1, m%2 1463*c0909341SAndroid Build Coastguard Worker psrlw m%1, 6 1464*c0909341SAndroid Build Coastguard Worker%endmacro 1465*c0909341SAndroid Build Coastguard Worker 1466*c0909341SAndroid Build Coastguard Worker%if WIN64 1467*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5 1468*c0909341SAndroid Build Coastguard Worker%else 1469*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8 1470*c0909341SAndroid Build Coastguard Worker%endif 1471*c0909341SAndroid Build Coastguard Worker 1472*c0909341SAndroid Build Coastguard Worker; Due to the use of vpdpbusd (which does 4 pixels per instruction) in 1473*c0909341SAndroid Build Coastguard Worker; the horizontal filter, 6-tap is only used for the vertical filter. 
1474*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap, 1475*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc 1476*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc 1477*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc 1478*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc 1479*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc 1480*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular, REGULAR, REGULAR 1481*c0909341SAndroid Build Coastguard Worker 1482*c0909341SAndroid Build Coastguard Workercglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns 1483*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx512icl 1484*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 1485*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 6tap_h, mx, 4tap_h 1486*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 1487*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 6tap_v, my, 4tap_v 1488*c0909341SAndroid Build Coastguard Worker lea r8, [put_avx512icl] 1489*c0909341SAndroid Build Coastguard Worker movsxd wq, wm 1490*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1491*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 1492*c0909341SAndroid Build Coastguard Worker jnz .h 1493*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1494*c0909341SAndroid Build Coastguard Worker jnz .v 1495*c0909341SAndroid Build Coastguard Worker.put: 1496*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 1497*c0909341SAndroid Build Coastguard Worker movzx wd, word [r8+wq*2+table_offset(put,)] 1498*c0909341SAndroid Build Coastguard Worker add wq, r8 1499*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 1500*c0909341SAndroid Build Coastguard Worker lea r7, 
[dsq*3] 1501*c0909341SAndroid Build Coastguard Worker%if WIN64 1502*c0909341SAndroid Build Coastguard Worker pop r8 1503*c0909341SAndroid Build Coastguard Worker%endif 1504*c0909341SAndroid Build Coastguard Worker jmp wq 1505*c0909341SAndroid Build Coastguard Worker.v: 1506*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1507*c0909341SAndroid Build Coastguard Worker shr myd, 16 1508*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1509*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1510*c0909341SAndroid Build Coastguard Worker tzcnt r6d, wd 1511*c0909341SAndroid Build Coastguard Worker movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] 1512*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [pw_512] 1513*c0909341SAndroid Build Coastguard Worker lea myq, [base+subpel_filters+1+myq*8] 1514*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [myq+0] 1515*c0909341SAndroid Build Coastguard Worker add r6, r8 1516*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, [myq+2] 1517*c0909341SAndroid Build Coastguard Worker mov nsq, ssq 1518*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [myq+4] 1519*c0909341SAndroid Build Coastguard Worker neg nsq 1520*c0909341SAndroid Build Coastguard Worker jmp r6 1521*c0909341SAndroid Build Coastguard Worker.v_w2: 1522*c0909341SAndroid Build Coastguard Worker movd xmm2, [srcq+nsq*2] 1523*c0909341SAndroid Build Coastguard Worker pinsrw xmm2, [srcq+nsq*1], 2 1524*c0909341SAndroid Build Coastguard Worker pinsrw xmm2, [srcq+ssq*0], 4 1525*c0909341SAndroid Build Coastguard Worker pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3 1526*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1527*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 1528*c0909341SAndroid Build Coastguard Worker palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 1529*c0909341SAndroid Build Coastguard Worker punpcklbw xmm1, xmm2, xmm3 ; 01 12 1530*c0909341SAndroid Build Coastguard Worker punpckhbw xmm2, xmm3 ; 23 
34 1531*c0909341SAndroid Build Coastguard Worker.v_w2_loop: 1532*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm4, [srcq+ssq*1] 1533*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1534*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm3, xmm1, xm7 ; a0 b0 1535*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 1536*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm2, xm8 ; a1 b1 1537*c0909341SAndroid Build Coastguard Worker paddw xmm3, xmm2 1538*c0909341SAndroid Build Coastguard Worker vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 1539*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 1540*c0909341SAndroid Build Coastguard Worker vpblendd xmm4, xmm0, 0x02 ; 5 6 1541*c0909341SAndroid Build Coastguard Worker punpcklbw xmm2, xmm4 ; 45 56 1542*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm4, xmm2, xm9 ; a2 b2 1543*c0909341SAndroid Build Coastguard Worker paddw xmm3, xmm4 1544*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm3, xm6 1545*c0909341SAndroid Build Coastguard Worker packuswb xmm3, xmm3 1546*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm3, 0 1547*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm3, 2 1548*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1549*c0909341SAndroid Build Coastguard Worker sub hd, 2 1550*c0909341SAndroid Build Coastguard Worker jg .v_w2_loop 1551*c0909341SAndroid Build Coastguard Worker RET 1552*c0909341SAndroid Build Coastguard Worker.v_w4: 1553*c0909341SAndroid Build Coastguard Worker movd xmm2, [srcq+nsq*2] 1554*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+nsq*1], 1 1555*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*0], 2 1556*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 1557*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1558*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 1559*c0909341SAndroid Build
Coastguard Worker palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 1560*c0909341SAndroid Build Coastguard Worker punpcklbw xmm1, xmm2, xmm3 ; 01 12 1561*c0909341SAndroid Build Coastguard Worker punpckhbw xmm2, xmm3 ; 23 34 1562*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 1563*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm4, [srcq+ssq*1] 1564*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1565*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm3, xmm1, xm7 ; a0 b0 1566*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 1567*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm2, xm8 ; a1 b1 1568*c0909341SAndroid Build Coastguard Worker paddw xmm3, xmm2 1569*c0909341SAndroid Build Coastguard Worker vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 1570*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm0, [srcq+ssq*0] 1571*c0909341SAndroid Build Coastguard Worker vpblendd xmm4, xmm0, 0x02 ; 5 6 1572*c0909341SAndroid Build Coastguard Worker punpcklbw xmm2, xmm4 ; 45 56 1573*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm4, xmm2, xm9 ; a2 b2 1574*c0909341SAndroid Build Coastguard Worker paddw xmm3, xmm4 1575*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm3, xm6 1576*c0909341SAndroid Build Coastguard Worker packuswb xmm3, xmm3 1577*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm3 1578*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm3, 1 1579*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1580*c0909341SAndroid Build Coastguard Worker sub hd, 2 1581*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 1582*c0909341SAndroid Build Coastguard Worker RET 1583*c0909341SAndroid Build Coastguard Worker.v_w8: 1584*c0909341SAndroid Build Coastguard Worker movq xmm1, [srcq+nsq*2] 1585*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+nsq*1] 1586*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm2, [srcq+ssq*0] 1587*c0909341SAndroid Build Coastguard Worker 
vpbroadcastq ymm4, [srcq+ssq*1] 1588*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1589*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 1590*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm3, 0x30 1591*c0909341SAndroid Build Coastguard Worker vpblendd ymm3, ymm2, 0x30 1592*c0909341SAndroid Build Coastguard Worker punpcklbw ymm1, ymm3 ; 01 12 1593*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm4, 0x30 1594*c0909341SAndroid Build Coastguard Worker vpblendd ymm4, ymm0, 0x30 1595*c0909341SAndroid Build Coastguard Worker punpcklbw ymm2, ymm4 ; 23 34 1596*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 1597*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm4, [srcq+ssq*1] 1598*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1599*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm3, ymm1, ym7 ; a0 b0 1600*c0909341SAndroid Build Coastguard Worker mova ymm1, ymm2 1601*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm2, ym8 ; a1 b1 1602*c0909341SAndroid Build Coastguard Worker paddw ymm3, ymm2 1603*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm0, ymm4, 0x30 1604*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm0, [srcq+ssq*0] 1605*c0909341SAndroid Build Coastguard Worker vpblendd ymm4, ymm0, 0x30 1606*c0909341SAndroid Build Coastguard Worker punpcklbw ymm2, ymm4 ; 45 56 1607*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm4, ymm2, ym9 ; a2 b2 1608*c0909341SAndroid Build Coastguard Worker paddw ymm3, ymm4 1609*c0909341SAndroid Build Coastguard Worker pmulhrsw ymm3, ym6 1610*c0909341SAndroid Build Coastguard Worker vextracti128 xmm4, ymm3, 1 1611*c0909341SAndroid Build Coastguard Worker packuswb xmm3, xmm4 1612*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm3 1613*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm3 1614*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1615*c0909341SAndroid Build Coastguard 
Worker sub hd, 2 1616*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 1617*c0909341SAndroid Build Coastguard Worker vzeroupper 1618*c0909341SAndroid Build Coastguard Worker RET 1619*c0909341SAndroid Build Coastguard Worker.v_w16: 1620*c0909341SAndroid Build Coastguard Worker mova m5, [spel_v_perm16a] 1621*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [srcq+nsq*2] 1622*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [srcq+nsq*1] 1623*c0909341SAndroid Build Coastguard Worker mov r6d, 0x0f 1624*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ssq*0] 1625*c0909341SAndroid Build Coastguard Worker kmovb k1, r6d 1626*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [srcq+ssq*1] 1627*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1628*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+ssq*0] 1629*c0909341SAndroid Build Coastguard Worker vshufpd m1{k1}, m3, m2, 0xcc 1630*c0909341SAndroid Build Coastguard Worker vshufpd m2{k1}, m4, m0, 0xcc 1631*c0909341SAndroid Build Coastguard Worker vpermb m1, m5, m1 ; 01 12 1632*c0909341SAndroid Build Coastguard Worker vpermb m2, m5, m2 ; 23 34 1633*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 1634*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [srcq+ssq*1] 1635*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1636*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m1, m7 ; a0 b0 1637*c0909341SAndroid Build Coastguard Worker mova m1, m2 1638*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m8 ; a1 b1 1639*c0909341SAndroid Build Coastguard Worker paddw m3, m2 1640*c0909341SAndroid Build Coastguard Worker mova m2, m0 1641*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+ssq*0] 1642*c0909341SAndroid Build Coastguard Worker vshufpd m2{k1}, m4, m0, 0xcc 1643*c0909341SAndroid Build Coastguard Worker vpermb m2, m5, m2 ; 45 56 1644*c0909341SAndroid Build Coastguard Worker pmaddubsw 
m4, m2, m9 ; a2 b2 1645*c0909341SAndroid Build Coastguard Worker paddw m3, m4 1646*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 1647*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym4, m3, 1 1648*c0909341SAndroid Build Coastguard Worker packuswb ym3, ym4 1649*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm3 1650*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym3, 1 1651*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1652*c0909341SAndroid Build Coastguard Worker sub hd, 2 1653*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 1654*c0909341SAndroid Build Coastguard Worker RET 1655*c0909341SAndroid Build Coastguard Worker.v_w32: 1656*c0909341SAndroid Build Coastguard Worker mova m10, [spel_v_perm32] 1657*c0909341SAndroid Build Coastguard Worker pmovzxbq m5, [pb_02461357] 1658*c0909341SAndroid Build Coastguard Worker vpshrdw m11, m10, m10, 8 1659*c0909341SAndroid Build Coastguard Worker movu ym0, [srcq+nsq*2] 1660*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+nsq*1], 1 1661*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m0 ; 01 1662*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*0], 0 1663*c0909341SAndroid Build Coastguard Worker vpermb m2, m11, m0 ; 12 1664*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*1], 1 1665*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1666*c0909341SAndroid Build Coastguard Worker vpermb m3, m10, m0 ; 23 1667*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*0], 0 1668*c0909341SAndroid Build Coastguard Worker vpermb m4, m11, m0 ; 34 1669*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 1670*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*1], 1 1671*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1672*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m1, m7 1673*c0909341SAndroid Build Coastguard Worker mova m1, m3 
1674*c0909341SAndroid Build Coastguard Worker pmaddubsw m13, m2, m7 1675*c0909341SAndroid Build Coastguard Worker mova m2, m4 1676*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m3, m8 1677*c0909341SAndroid Build Coastguard Worker vpermb m3, m10, m0 ; 45 1678*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [srcq+ssq*0], 0 1679*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m4, m8 1680*c0909341SAndroid Build Coastguard Worker vpermb m4, m11, m0 ; 56 1681*c0909341SAndroid Build Coastguard Worker paddw m12, m14 1682*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m3, m9 1683*c0909341SAndroid Build Coastguard Worker paddw m13, m15 1684*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m4, m9 1685*c0909341SAndroid Build Coastguard Worker paddw m12, m14 1686*c0909341SAndroid Build Coastguard Worker paddw m13, m15 1687*c0909341SAndroid Build Coastguard Worker pmulhrsw m12, m6 1688*c0909341SAndroid Build Coastguard Worker pmulhrsw m13, m6 1689*c0909341SAndroid Build Coastguard Worker packuswb m12, m13 1690*c0909341SAndroid Build Coastguard Worker vpermq m12, m5, m12 1691*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym12 1692*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m12, 1 1693*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1694*c0909341SAndroid Build Coastguard Worker sub hd, 2 1695*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 1696*c0909341SAndroid Build Coastguard Worker RET 1697*c0909341SAndroid Build Coastguard Worker.v_w64: 1698*c0909341SAndroid Build Coastguard Worker.v_w128: 1699*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+wq*4-256] 1700*c0909341SAndroid Build Coastguard Worker.v_loop0: 1701*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+nsq*2] 1702*c0909341SAndroid Build Coastguard Worker movu m4, [srcq+nsq*1] 1703*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ssq*2] 1704*c0909341SAndroid Build Coastguard Worker movu m11, 
[srcq+ssq*0] 1705*c0909341SAndroid Build Coastguard Worker movu m13, [srcq+ssq*1] 1706*c0909341SAndroid Build Coastguard Worker mov r7, dstq 1707*c0909341SAndroid Build Coastguard Worker movu m0, [r4 +ssq*0] 1708*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m2, m4 ; 01l 1709*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m4 ; 01h 1710*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4, m11 ; 12l 1711*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m11 ; 12h 1712*c0909341SAndroid Build Coastguard Worker punpcklbw m10, m11, m13 ; 23l 1713*c0909341SAndroid Build Coastguard Worker punpckhbw m11, m13 ; 23h 1714*c0909341SAndroid Build Coastguard Worker punpcklbw m12, m13, m0 ; 34l 1715*c0909341SAndroid Build Coastguard Worker punpckhbw m13, m0 ; 34h 1716*c0909341SAndroid Build Coastguard Worker.v_loop: 1717*c0909341SAndroid Build Coastguard Worker movu m5, [r4+ssq*1] 1718*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m1, m7 ; a0l 1719*c0909341SAndroid Build Coastguard Worker mova m1, m10 1720*c0909341SAndroid Build Coastguard Worker pmaddubsw m10, m8 ; a1l 1721*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 1722*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m2, m7 ; a0h 1723*c0909341SAndroid Build Coastguard Worker mova m2, m11 1724*c0909341SAndroid Build Coastguard Worker pmaddubsw m11, m8 ; a1h 1725*c0909341SAndroid Build Coastguard Worker paddw m14, m10 1726*c0909341SAndroid Build Coastguard Worker punpcklbw m10, m0, m5 ; 45l 1727*c0909341SAndroid Build Coastguard Worker paddw m15, m11 1728*c0909341SAndroid Build Coastguard Worker punpckhbw m11, m0, m5 ; 45h 1729*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m10, m9 ; a2l 1730*c0909341SAndroid Build Coastguard Worker paddw m14, m0 1731*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m11, m9 ; a2h 1732*c0909341SAndroid Build Coastguard Worker paddw m15, m0 1733*c0909341SAndroid Build Coastguard Worker movu m0, [r4+ssq*0] 
1734*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m6 1735*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m6 1736*c0909341SAndroid Build Coastguard Worker packuswb m14, m15 1737*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m3, m7 ; b0l 1738*c0909341SAndroid Build Coastguard Worker mova m3, m12 1739*c0909341SAndroid Build Coastguard Worker pmaddubsw m12, m8 ; b1l 1740*c0909341SAndroid Build Coastguard Worker mova [r7+dsq*0], m14 1741*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m4, m7 ; b0h 1742*c0909341SAndroid Build Coastguard Worker mova m4, m13 1743*c0909341SAndroid Build Coastguard Worker pmaddubsw m13, m8 ; b1h 1744*c0909341SAndroid Build Coastguard Worker paddw m15, m12 1745*c0909341SAndroid Build Coastguard Worker punpcklbw m12, m5, m0 ; 56l 1746*c0909341SAndroid Build Coastguard Worker paddw m14, m13 1747*c0909341SAndroid Build Coastguard Worker punpckhbw m13, m5, m0 ; 56h 1748*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m12, m9 ; b2l 1749*c0909341SAndroid Build Coastguard Worker paddw m15, m5 1750*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m9 ; b2h 1751*c0909341SAndroid Build Coastguard Worker paddw m14, m5 1752*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m6 1753*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m6 1754*c0909341SAndroid Build Coastguard Worker packuswb m15, m14 1755*c0909341SAndroid Build Coastguard Worker mova [r7+dsq*1], m15 1756*c0909341SAndroid Build Coastguard Worker lea r7, [r7+dsq*2] 1757*c0909341SAndroid Build Coastguard Worker sub hd, 2 1758*c0909341SAndroid Build Coastguard Worker jg .v_loop 1759*c0909341SAndroid Build Coastguard Worker add srcq, 64 1760*c0909341SAndroid Build Coastguard Worker add dstq, 64 1761*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 1762*c0909341SAndroid Build Coastguard Worker sub r6d, 256 1763*c0909341SAndroid Build Coastguard Worker jg .v_loop0 1764*c0909341SAndroid Build Coastguard Worker RET 
1765*c0909341SAndroid Build Coastguard Worker.h: 1766*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 1767*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2 1768*c0909341SAndroid Build Coastguard Worker.hv: 1769*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_34] 1770*c0909341SAndroid Build Coastguard Worker mova xm10, [spel_hv_end] 1771*c0909341SAndroid Build Coastguard Worker pxor xm0, xm0 1772*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1773*c0909341SAndroid Build Coastguard Worker jg .hv_w8 1774*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 1775*c0909341SAndroid Build Coastguard Worker dec srcq 1776*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+subpel_filters+mxq*8+2] 1777*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 1778*c0909341SAndroid Build Coastguard Worker shr myd, 16 1779*c0909341SAndroid Build Coastguard Worker cmp hd, 6 1780*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 1781*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym1, [base+subpel_filters+1+myq*8] 1782*c0909341SAndroid Build Coastguard Worker mov nsq, ssq 1783*c0909341SAndroid Build Coastguard Worker punpcklbw ym0, ym1 1784*c0909341SAndroid Build Coastguard Worker neg nsq 1785*c0909341SAndroid Build Coastguard Worker psraw ym0, 2 ; << 6 1786*c0909341SAndroid Build Coastguard Worker pshufd ym11, ym0, q0000 1787*c0909341SAndroid Build Coastguard Worker pshufd ym12, ym0, q1111 1788*c0909341SAndroid Build Coastguard Worker pshufd ym13, ym0, q2222 1789*c0909341SAndroid Build Coastguard Worker cmp wd, 4 1790*c0909341SAndroid Build Coastguard Worker je .hv_w4 1791*c0909341SAndroid Build Coastguard Worker vbroadcasti128 ym5, [subpel_h_shuf4] 1792*c0909341SAndroid Build Coastguard Worker movq xmm0, [srcq+nsq*2] 1793*c0909341SAndroid Build Coastguard Worker movhps xmm0, [srcq+nsq*1] 1794*c0909341SAndroid Build Coastguard Worker movq xmm2, [srcq+ssq*0] 1795*c0909341SAndroid 
Build Coastguard Worker movhps xmm2, [srcq+ssq*1] 1796*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1797*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm1, [srcq+ssq*0] 1798*c0909341SAndroid Build Coastguard Worker vpblendd ymm0, ymm1, 0x30 1799*c0909341SAndroid Build Coastguard Worker pshufb xmm2, xm5 ; 2 3 1800*c0909341SAndroid Build Coastguard Worker pshufb ymm0, ym5 ; 0 1 4 1801*c0909341SAndroid Build Coastguard Worker mova xmm1, xm9 1802*c0909341SAndroid Build Coastguard Worker vpdpbusd xmm1, xmm2, xm7 1803*c0909341SAndroid Build Coastguard Worker mova ymm2, ym9 1804*c0909341SAndroid Build Coastguard Worker vpdpbusd ymm2, ymm0, ym7 1805*c0909341SAndroid Build Coastguard Worker packssdw ymm2, ymm1 1806*c0909341SAndroid Build Coastguard Worker psraw ymm2, 2 1807*c0909341SAndroid Build Coastguard Worker vextracti128 xmm0, ymm2, 1 1808*c0909341SAndroid Build Coastguard Worker vzeroupper 1809*c0909341SAndroid Build Coastguard Worker palignr xmm0, xmm2, 4 1810*c0909341SAndroid Build Coastguard Worker punpcklwd xmm1, xmm2, xmm0 ; 01 12 1811*c0909341SAndroid Build Coastguard Worker punpckhwd xmm2, xmm0 ; 23 34 1812*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 1813*c0909341SAndroid Build Coastguard Worker movq xmm3, [srcq+ssq*1] 1814*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1815*c0909341SAndroid Build Coastguard Worker movhps xmm3, [srcq+ssq*0] 1816*c0909341SAndroid Build Coastguard Worker pmaddwd xmm4, xmm1, xm11 ; a0 b0 1817*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 1818*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm4, xmm2, xm12 ; a1 b1 1819*c0909341SAndroid Build Coastguard Worker pshufb xmm3, xm5 1820*c0909341SAndroid Build Coastguard Worker mova xmm2, xm9 1821*c0909341SAndroid Build Coastguard Worker vpdpbusd xmm2, xmm3, xm7 1822*c0909341SAndroid Build Coastguard Worker packssdw xmm3, xmm2, xmm2 1823*c0909341SAndroid Build Coastguard Worker psraw xmm3, 2 1824*c0909341SAndroid Build 
Coastguard Worker palignr xmm2, xmm3, xmm0, 12 1825*c0909341SAndroid Build Coastguard Worker mova xmm0, xmm3 1826*c0909341SAndroid Build Coastguard Worker punpcklwd xmm2, xmm3 ; 45 56 1827*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm4, xmm2, xm13 ; a2 b2 1828*c0909341SAndroid Build Coastguard Worker packuswb xmm4, xmm4 1829*c0909341SAndroid Build Coastguard Worker pshufb xmm4, xm10 1830*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm4, 0 1831*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm4, 1 1832*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1833*c0909341SAndroid Build Coastguard Worker sub hd, 2 1834*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 1835*c0909341SAndroid Build Coastguard Worker RET 1836*c0909341SAndroid Build Coastguard Worker.hv_w4: 1837*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+nsq*2] 1838*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym1, [srcq+nsq*1] 1839*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym2, [srcq+ssq*0], 1 1840*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3 1841*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1842*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [subpel_h_shufA] 1843*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4 1844*c0909341SAndroid Build Coastguard Worker pshufb m1, m5 1845*c0909341SAndroid Build Coastguard Worker mova m0, m9 1846*c0909341SAndroid Build Coastguard Worker pshufb m2, m5 1847*c0909341SAndroid Build Coastguard Worker mova m3, m9 1848*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m1, m7 1849*c0909341SAndroid Build Coastguard Worker mova ym1, [spel_hv_perm4a] 1850*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m2, m7 1851*c0909341SAndroid Build Coastguard Worker mova ym2, [spel_hv_perm4b] 1852*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5555 1853*c0909341SAndroid Build Coastguard 
Worker mova ym6, [spel_hv_perm4d] 1854*c0909341SAndroid Build Coastguard Worker packssdw m0, m3 1855*c0909341SAndroid Build Coastguard Worker kmovw k1, r6d 1856*c0909341SAndroid Build Coastguard Worker psraw m0, 2 ; _ 0 1 2 3 4 5 6 1857*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym1, ym0 ; 01 12 1858*c0909341SAndroid Build Coastguard Worker vpermb m2, m2, m0 ; 23 34 1859*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 1860*c0909341SAndroid Build Coastguard Worker movq xm3, [srcq+ssq*1] 1861*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 1862*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym3, [srcq+ssq*0], 1 1863*c0909341SAndroid Build Coastguard Worker pmaddwd ym4, ym1, ym11 ; a0 b0 1864*c0909341SAndroid Build Coastguard Worker mova ym1, ym2 1865*c0909341SAndroid Build Coastguard Worker pshufb ym3, ym5 1866*c0909341SAndroid Build Coastguard Worker mova ym0, ym9 1867*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym3, ym7 1868*c0909341SAndroid Build Coastguard Worker vpdpwssd ym4, ym2, ym12 ; a1 b1 1869*c0909341SAndroid Build Coastguard Worker vpsraw ym2{k1}, ym0, 2 ; 5 6 1870*c0909341SAndroid Build Coastguard Worker vpermb ym2, ym6, ym2 ; 45 56 1871*c0909341SAndroid Build Coastguard Worker vpdpwssd ym4, ym2, ym13 ; a2 b2 1872*c0909341SAndroid Build Coastguard Worker packuswb ym4, ym4 1873*c0909341SAndroid Build Coastguard Worker vpermb ym4, ym10, ym4 1874*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm4 1875*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm4, 1 1876*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 1877*c0909341SAndroid Build Coastguard Worker sub hd, 2 1878*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 1879*c0909341SAndroid Build Coastguard Worker RET 1880*c0909341SAndroid Build Coastguard Worker.hv_w8: 1881*c0909341SAndroid Build Coastguard Worker shr mxd, 16 1882*c0909341SAndroid Build Coastguard Worker sub srcq, 3 1883*c0909341SAndroid Build 
; -----------------------------------------------------------------------------
; NOTE(review): everything down to the blank line before PUT_8TAP_FN continues
; a routine whose entry point and .hv prologue lie above this span. m0, m9,
; m10, mxq, base and the source offset are inherited from there. Only three
; vertical coefficient registers (m13-m15) are built and the vertical filter
; is read from byte 1 of its table entry, so this is presumably the w >= 8
; part of a 6-tap put hv path -- confirm against the preceding lines.
; -----------------------------------------------------------------------------
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+0] ; horizontal taps, bytes 0-3
    vpbroadcastd        m12, [base+subpel_filters+mxq*8+4] ; horizontal taps, bytes 4-7
    ; pick the vertical filter index: bits 16+ of my, or its low byte if h < 6
    movzx               mxd, myb
    shr                 myd, 16
    cmp                 hd, 6
    cmovs               myd, mxd
    vpbroadcastq        m1, [base+subpel_filters+1+myq*8]
    mov                 nsq, ssq
    punpcklbw           m0, m1
    neg                 nsq                 ; nsq = -stride, addresses the rows above srcq
    psraw               m0, 2 ; << 6
    pshufd              m13, m0, q0000      ; vertical coefficient pair 0
    pshufd              m14, m0, q1111      ; vertical coefficient pair 1
    pshufd              m15, m0, q2222      ; vertical coefficient pair 2
    cmp                 wd, 8
    jne .hv_w16
    ; w == 8: rows -2..+1 go into the four 128-bit lanes of m0
    movu                xm0, [srcq+nsq*2]
    vinserti32x4        ym0, [srcq+nsq*1], 1
    vbroadcasti32x4     m1, [subpel_h_shufA]
    vinserti32x4        m0, [srcq+ssq*0], 2
    vbroadcasti32x4     m4, [subpel_h_shufB]
    vinserti32x4        m0, [srcq+ssq*1], 3
    lea                 srcq, [srcq+ssq*2]
    vbroadcasti32x4     m7, [subpel_h_shufC]
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    vbroadcasti32x8     m6, [subpel_h_shufA]
    pshufb              m1, m0, m1 ; 0 1 2 3 0123
    mova                m2, m9              ; m9 seeds every vpdpbusd accumulator (set above this span)
    vpdpbusd            m2, m1, m11
    pshufb              m4, m0, m4 ; 0 1 2 3 4567
    mova                m1, m9
    vpdpbusd            m1, m4, m11
    pshufb              m0, m7 ; 0 1 2 3 89ab
    pshufb              ym7, ym5, ym6 ; 4 0123 4567
    mova                ym3, ym9
    vpdpbusd            ym3, ym7, ym11
    vbroadcasti32x8     m7, [subpel_h_shufB]
    vpdpbusd            m2, m4, m12
    mova                m4, [spel_hv_perm8a]
    pshufb              ym5, ym7 ; 4 4567 89ab
    vpdpbusd            m1, m0, m12
    vpaddd              m0, m4, [pb_32] {1to16}
    vpdpbusd            ym3, ym5, ym12
    mova                m5, [spel_hv_perm8b]
    ; k1 = 0x55555555ff00: as a dword mask (low 16 bits = 0xff00) it selects the
    ; upper 256 bits; k2 = k1 >> 16 = 0x55555555 selects the even words
    mov                 r6, 0x55555555ff00
    packssdw            m2, m1
    vpmovsdw            xm3, ym3
    kmovq               k1, r6
    psraw               m2, 2 ; 0 1 2 3
    psraw               xm3, 2 ; 4
    vpermb              m1, m4, m2 ; 01 12
    kshiftrq            k2, k1, 16
    vpermt2b            m2, m0, m3 ; 23 34
; two output rows (a, b) per iteration
.hv_w8_loop:
    vbroadcasti32x4     ym3, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    vbroadcasti32x4     m3{k1}, [srcq+ssq*0]
    pmaddwd             m0, m1, m13 ; a0 b0
    pshufb              m1, m3, m6 ; 5 6 0123 4567
    mova                m4, m9
    vpdpbusd            m4, m1, m11
    pshufb              m3, m7 ; 5 6 4567 89ab
    vpdpwssd            m0, m2, m14 ; a1 b1
    mova                m1, m2
    vpdpbusd            m4, m3, m12
    psraw               m2{k2}, m4, 2 ; 53 64
    vpermb              m2, m5, m2 ; 45 56
    vpdpwssd            m0, m2, m15 ; a2 b2
    packuswb            m0, m0
    vpermb              m0, m10, m0         ; m10: output byte permutation loaded above this span
    movq                [dstq+dsq*0], xm0
    movhps              [dstq+dsq*1], xm0
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .hv_w8_loop
    RET
; w >= 16: handled as 16-pixel-wide column strips.
; r6d = h + (w*2-32)*8 = h + (w-16)*16: the low byte keeps h, bits 8+ count the
; strips that remain after the current one.
.hv_w16:
    movu                m19, [spel_hv_perm16a]
    vpbroadcastd        m7, [pb_4]
    lea                 r6d, [wq*2-32]
    mova                m6, [spel_hv_perm16b]
    paddb               m20, m7, m19        ; perm16a + 4
    lea                 r6d, [hq+r6*8]
    paddb               m21, m7, m20        ; perm16a + 8
    mova                ym10, [spel_hv_end16]
    paddb               m7, m6              ; perm16b + 4
.hv_w16_loop0:
    ; strip prologue: horizontally filter rows 0-4 (r4/r7 walk src/dst so that
    ; srcq/dstq stay at the top of the strip)
    movu                ym16, [srcq+nsq*2]
    vinserti32x8        m16, [srcq+nsq*1], 1
    lea                 r4, [srcq+ssq*2]
    movu                ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1
    mov                 r7, dstq
    movu                ym18, [r4 +ssq*0]
    vpermb              m2, m19, m16 ; 0 1 0123 89ab
    mova                m1, m9
    vpermb              m3, m21, m16 ; 0 1 89ab ghij
    vpdpbusd            m1, m2, m11
    mova                m2, m9
    vpermb              m4, m19, m17 ; 2 3 0123 89ab
    vpdpbusd            m2, m3, m12
    mova                m3, m9
    vpermb              m5, m21, m17 ; 2 3 89ab ghij
    vpdpbusd            m3, m4, m11
    mova                m4, m9
    vpermb              m0, m6, m18 ; 4 0145 2367 89cd abef
    vpdpbusd            m4, m5, m12
    mova                m5, m9
    vpermb              m16, m20, m16 ; 0 1 4567 cdef
    vpdpbusd            m5, m0, m11
    vpermb              m17, m20, m17 ; 2 3 4567 cdef
    vpdpbusd            m1, m16, m12
    vpermb              m18, m7, m18 ; 4 4589 67ab cdgh efij
    vpdpbusd            m2, m16, m11
    vpdpbusd            m3, m17, m12
    vpdpbusd            m4, m17, m11
    vpdpbusd            m5, m18, m12
    packssdw            m1, m2 ; 01
    packssdw            m3, m4 ; 23
    REPX {psraw x, 2}, m1, m3, m5
    vpshrdd             m2, m1, m3, 16 ; 12
    vpshrdd             m4, m3, m5, 16 ; 34
; two output rows (a, b) per iteration
.hv_w16_loop:
    movu                ym18, [r4+ssq*1]
    lea                 r4, [r4+ssq*2]
    vinserti32x8        m18, [r4+ssq*0], 1
    pmaddwd             m16, m1, m13 ; a0
    vpermb              m1, m19, m18 ; 5 6 0123 89ab
    pmaddwd             m17, m2, m13 ; b0
    vpermb              m2, m20, m18 ; 5 6 4567 cdef
    mova                m0, m9
    vpdpbusd            m0, m1, m11
    vpermb              m18, m21, m18
    mova                m1, m9
    vpdpbusd            m1, m2, m11
    vpdpwssd            m16, m3, m14 ; a1
    vpdpwssd            m17, m4, m14 ; b1
    vpdpbusd            m0, m2, m12
    mova                m2, m4
    vpdpbusd            m1, m18, m12
    packssdw            m0, m1
    mova                m1, m3
    psraw               m4, m0, 2 ; 5 6
    vpshrdd             m3, m2, m4, 16 ; 4 5
    vpdpwssd            m17, m4, m15 ; b2
    vpdpwssd            m16, m3, m15 ; a2
    packuswb            m16, m17
    vpermb              m16, m10, m16
    mova                [r7+dsq*0], xm16
    vextracti128        [r7+dsq*1], ym16, 1
    lea                 r7, [r7+dsq*2]
    sub                 hd, 2
    jg .hv_w16_loop
    ; next 16-pixel strip: restore h from the low byte of r6, drop one strip
    add                 srcq, 16
    add                 dstq, 16
    movzx               hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET

PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
PUT_8TAP_FN sharp, SHARP, SHARP

; -----------------------------------------------------------------------------
; put_8tap_8bpc(dst, ds, src, ss, w, h, mx, my)
;
; 8-tap subpel "put" for 8-bit pixels. Dispatch visible in this span:
;   mx fraction != 0             -> .h  (which goes on to .hv if my != 0 too;
;                                        .hv is defined below this span)
;   mx == 0, my fraction != 0    -> .v
;   mx == 0, my == 0             -> .put of the 6-tap function
; Width-specific code is reached through 16-bit jump tables relative to
; put_avx512icl (kept in r8), indexed by tzcnt(w). Every loop in this span
; produces two output rows per iteration.
; NOTE(review): t0d/t1d are presumably preloaded with the filter-type offsets
; by the PUT_8TAP_FN entry stubs (macro defined outside this span) -- confirm.
; -----------------------------------------------------------------------------
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul                mxd, mxm, 0x010101  ; replicate mx into bytes 0-2
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101  ; replicate my into bytes 0-2
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                 r8, [put_avx512icl]
    movsxd              wq, wm
    movifnidn           hd, hm
    test                mxd, 0xf00          ; byte 1 still holds the raw mx fraction
    jnz .h
    test                myd, 0xf00          ; same for my
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
; ---- vertical-only filtering -------------------------------------------------
.v:
    ; filter index: bits 16+ of my, or its low byte if h < 6
    movzx               mxd, myb
    shr                 myd, 16
    cmp                 hd, 6
    cmovs               myd, mxd
    tzcnt               r6d, wd
    lea                 myq, [base+subpel_filters+myq*8]
    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd        m7, [pw_512]        ; pmulhrsw by 512 == (x + 32) >> 6
    vpbroadcastw        m8, [myq+0]         ; taps 0,1
    add                 r6, r8
    vpbroadcastw        m9, [myq+2]         ; taps 2,3
    lea                 ss3q, [ssq*3]
    vpbroadcastw        m10, [myq+4]        ; taps 4,5
    sub                 srcq, ss3q          ; start three rows above the output row
    vpbroadcastw        m11, [myq+6]        ; taps 6,7
    jmp                 r6
; w == 2: each row's two pixels sit in the low word of its own dword slot
.v_w2:
    movd                xmm2, [srcq+ssq*0]
    pinsrw              xmm2, [srcq+ssq*1], 2
    pinsrw              xmm2, [srcq+ssq*2], 4
    add                 srcq, ss3q
    pinsrw              xmm2, [srcq+ssq*0], 6 ; 0 1 2 3
    movd                xmm3, [srcq+ssq*1]
    vpbroadcastd        xmm1, [srcq+ssq*2]
    add                 srcq, ss3q
    vpbroadcastd        xmm0, [srcq+ssq*0]
    vpblendd            xmm3, xmm3, xmm1, 0x02 ; 4 5
    vpblendd            xmm1, xmm1, xmm0, 0x02 ; 5 6
    palignr             xmm4, xmm3, xmm2, 4 ; 1 2 3 4
    punpcklbw           xmm3, xmm1 ; 45 56
    punpcklbw           xmm1, xmm2, xmm4 ; 01 12
    punpckhbw           xmm2, xmm4 ; 23 34
.v_w2_loop:
    pmaddubsw           xmm5, xmm1, xm8 ; a0 b0
    mova                xmm1, xmm2
    pmaddubsw           xmm2, xm9 ; a1 b1
    paddw               xmm5, xmm2
    mova                xmm2, xmm3
    pmaddubsw           xmm3, xm10 ; a2 b2
    paddw               xmm5, xmm3
    vpbroadcastd        xmm4, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    vpblendd            xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd        xmm0, [srcq+ssq*0]
    vpblendd            xmm4, xmm4, xmm0, 0x02 ; 7 8
    punpcklbw           xmm3, xmm4 ; 67 78
    pmaddubsw           xmm4, xmm3, xm11 ; a3 b3
    paddw               xmm5, xmm4
    pmulhrsw            xmm5, xm7
    packuswb            xmm5, xmm5
    pextrw              [dstq+dsq*0], xmm5, 0
    pextrw              [dstq+dsq*1], xmm5, 2
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_w2_loop
    RET
; w == 4: one dword per row
.v_w4:
    movd                xmm2, [srcq+ssq*0]
    pinsrd              xmm2, [srcq+ssq*1], 1
    pinsrd              xmm2, [srcq+ssq*2], 2
    add                 srcq, ss3q
    pinsrd              xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd                xmm3, [srcq+ssq*1]
    vpbroadcastd        xmm1, [srcq+ssq*2]
    add                 srcq, ss3q
    vpbroadcastd        xmm0, [srcq+ssq*0]
    vpblendd            xmm3, xmm3, xmm1, 0x02 ; 4 5
    vpblendd            xmm1, xmm1, xmm0, 0x02 ; 5 6
    palignr             xmm4, xmm3, xmm2, 4 ; 1 2 3 4
    punpcklbw           xmm3, xmm1 ; 45 56
    punpcklbw           xmm1, xmm2, xmm4 ; 01 12
    punpckhbw           xmm2, xmm4 ; 23 34
.v_w4_loop:
    vpbroadcastd        xmm4, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    pmaddubsw           xmm5, xmm1, xm8 ; a0 b0
    mova                xmm1, xmm2
    pmaddubsw           xmm2, xm9 ; a1 b1
    paddw               xmm5, xmm2
    mova                xmm2, xmm3
    pmaddubsw           xmm3, xm10 ; a2 b2
    paddw               xmm5, xmm3
    vpblendd            xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd        xmm0, [srcq+ssq*0]
    vpblendd            xmm4, xmm4, xmm0, 0x02 ; 7 8
    punpcklbw           xmm3, xmm4 ; 67 78
    pmaddubsw           xmm4, xmm3, xm11 ; a3 b3
    paddw               xmm5, xmm4
    pmulhrsw            xmm5, xm7
    packuswb            xmm5, xmm5
    movd                [dstq+dsq*0], xmm5
    pextrd              [dstq+dsq*1], xmm5, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_w4_loop
    RET
; w == 8: row pairs are kept in the two 128-bit halves of a ymm register
.v_w8:
    movq                xmm1, [srcq+ssq*0]
    vpbroadcastq        ymm0, [srcq+ssq*1]
    vpbroadcastq        ymm2, [srcq+ssq*2]
    add                 srcq, ss3q
    vpbroadcastq        ymm5, [srcq+ssq*0]
    vpbroadcastq        ymm3, [srcq+ssq*1]
    vpbroadcastq        ymm4, [srcq+ssq*2]
    add                 srcq, ss3q
    vpblendd            ymm1, ymm0, 0x30
    vpblendd            ymm0, ymm2, 0x30
    punpcklbw           ymm1, ymm0 ; 01 12
    vpbroadcastq        ymm0, [srcq+ssq*0]
    vpblendd            ymm2, ymm5, 0x30
    vpblendd            ymm5, ymm3, 0x30
    punpcklbw           ymm2, ymm5 ; 23 34
    vpblendd            ymm3, ymm4, 0x30
    vpblendd            ymm4, ymm0, 0x30
    punpcklbw           ymm3, ymm4 ; 45 56
.v_w8_loop:
    vpbroadcastq        ymm4, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    pmaddubsw           ymm5, ymm1, ym8 ; a0 b0
    mova                ymm1, ymm2
    pmaddubsw           ymm2, ym9 ; a1 b1
    paddw               ymm5, ymm2
    mova                ymm2, ymm3
    pmaddubsw           ymm3, ym10 ; a2 b2
    paddw               ymm5, ymm3
    vpblendd            ymm3, ymm0, ymm4, 0x30
    vpbroadcastq        ymm0, [srcq+ssq*0]
    vpblendd            ymm4, ymm4, ymm0, 0x30
    punpcklbw           ymm3, ymm4 ; 67 78
    pmaddubsw           ymm4, ymm3, ym11 ; a3 b3
    paddw               ymm5, ymm4
    pmulhrsw            ymm5, ym7
    vextracti128        xmm4, ymm5, 1
    packuswb            xmm5, xmm4
    movq                [dstq+dsq*0], xmm5
    movhps              [dstq+dsq*1], xmm5
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
; w == 16: k1 = 0x0f restricts the vshufpd merges to the low four qwords;
; vpermb with spel_v_perm16a then produces the interleaved row pairs
.v_w16:
    mova                m12, [spel_v_perm16a]
    vbroadcasti32x4     m1, [srcq+ssq*0]
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    mov                 r6d, 0x0f
    vbroadcasti32x4     m2, [srcq+ssq*2]
    add                 srcq, ss3q
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    kmovb               k1, r6d
    vbroadcasti32x4     m3, [srcq+ssq*1]
    vbroadcasti32x4     ym6, [srcq+ssq*2]
    add                 srcq, ss3q
    vbroadcasti32x4     m0, [srcq+ssq*0]
    vshufpd             m1{k1}, m4, m2, 0xcc
    vshufpd             m2{k1}, m5, m3, 0xcc
    vshufpd             m3{k1}, m6, m0, 0xcc
    vpermb              m1, m12, m1 ; 01 12
    vpermb              m2, m12, m2 ; 23 34
    vpermb              m3, m12, m3 ; 45 56
.v_w16_loop:
    pmaddubsw           m4, m1, m8 ; a0 b0
    mova                m1, m2
    pmaddubsw           m5, m2, m9 ; a1 b1
    mova                m2, m3
    pmaddubsw           m6, m3, m10 ; a2 b2
    mova                m3, m0
    paddw               m4, m5
    vbroadcasti32x4     ym5, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    vbroadcasti32x4     m0, [srcq+ssq*0]
    vshufpd             m3{k1}, m5, m0, 0xcc
    vpermb              m3, m12, m3 ; 67 78
    pmaddubsw           m5, m3, m11 ; a3 b3
    paddw               m4, m6
    paddw               m4, m5
    pmulhrsw            m4, m7
    vextracti32x8       ym5, m4, 1
    packuswb            ym4, ym5
    mova                [dstq+dsq*0], xm4
    vextracti32x4       [dstq+dsq*1], ym4, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_w16_loop
    RET
; w == 32: m0 holds the two most recent rows (one per 256-bit half); m12 and
; m13 (m12 with the bytes of every word swapped) interleave them in either order
.v_w32:
    mova                m12, [spel_v_perm32]
    pmovzxbq            m14, [pb_02461357]  ; qword order used for the final vpermq
    vpshrdw             m13, m12, m12, 8
    movu                ym0, [srcq+ssq*0]
    vinserti32x8        m0, [srcq+ssq*1], 1
    vpermb              m1, m12, m0 ; 01
    vinserti32x8        m0, [srcq+ssq*2], 0
    add                 srcq, ss3q
    vpermb              m2, m13, m0 ; 12
    vinserti32x8        m0, [srcq+ssq*0], 1
    vpermb              m3, m12, m0 ; 23
    vinserti32x8        m0, [srcq+ssq*1], 0
    vpermb              m4, m13, m0 ; 34
    vinserti32x8        m0, [srcq+ssq*2], 1
    add                 srcq, ss3q
    vpermb              m5, m12, m0 ; 45
    vinserti32x8        m0, [srcq+ssq*0], 0
    vpermb              m6, m13, m0 ; 56
.v_w32_loop:
    vinserti32x8        m0, [srcq+ssq*1], 1
    lea                 srcq, [srcq+ssq*2]
    pmaddubsw           m15, m1, m8
    mova                m1, m3
    pmaddubsw           m16, m2, m8
    mova                m2, m4
    pmaddubsw           m17, m3, m9
    mova                m3, m5
    pmaddubsw           m18, m4, m9
    mova                m4, m6
    pmaddubsw           m19, m5, m10
    vpermb              m5, m12, m0 ; 67
    vinserti32x8        m0, [srcq+ssq*0], 0
    pmaddubsw           m20, m6, m10
    vpermb              m6, m13, m0 ; 78
    paddw               m15, m17
    pmaddubsw           m17, m5, m11
    paddw               m16, m18
    pmaddubsw           m18, m6, m11
    paddw               m15, m19
    paddw               m16, m20
    paddw               m15, m17
    paddw               m16, m18
    pmulhrsw            m15, m7
    pmulhrsw            m16, m7
    packuswb            m15, m16
    vpermq              m15, m14, m15
    mova                [dstq+dsq*0], ym15
    vextracti32x8       [dstq+dsq*1], m15, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
; w == 64 / 128: handled as 64-pixel-wide column strips.
; r6d = h + w*4 - 256 = h + (w-64)*4: the low byte keeps h, bits 8+ count the
; strips that remain after the current one. r4/r7 remember the top of the
; current strip in src/dst.
.v_w64:
.v_w128:
    lea                 r6d, [hq+wq*4-256]
    mov                 r4, srcq
    mov                 r7, dstq
.v_loop0:
    movu                m2, [srcq+ssq*0]
    movu                m4, [srcq+ssq*1]
    movu                m6, [srcq+ssq*2]
    add                 srcq, ss3q
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    movu                m17, [srcq+ssq*2]
    add                 srcq, ss3q
    movu                m0, [srcq+ssq*0]
    punpcklbw           m1, m2, m4 ; 01l
    punpckhbw           m2, m4 ; 01h
    punpcklbw           m3, m4, m6 ; 12l
    punpckhbw           m4, m6 ; 12h
    punpcklbw           m5, m6, m13 ; 23l
    punpckhbw           m6, m13 ; 23h
    punpcklbw           m12, m13, m15 ; 34l
    punpckhbw           m13, m15 ; 34h
    punpcklbw           m14, m15, m17 ; 45l
    punpckhbw           m15, m17 ; 45h
    punpcklbw           m16, m17, m0 ; 56l
    punpckhbw           m17, m0 ; 56h
.v_loop:
    pmaddubsw           m18, m1, m8 ; a0l
    mova                m1, m5
    pmaddubsw           m19, m2, m8 ; a0h
    mova                m2, m6
    pmaddubsw           m20, m3, m8 ; b0l
    mova                m3, m12
    pmaddubsw           m21, m4, m8 ; b0h
    mova                m4, m13
    pmaddubsw           m5, m9 ; a1l
    pmaddubsw           m6, m9 ; a1h
    pmaddubsw           m12, m9 ; b1l
    pmaddubsw           m13, m9 ; b1h
    paddw               m18, m5
    mova                m5, m14
    pmaddubsw           m14, m10 ; a2l
    paddw               m19, m6
    mova                m6, m15
    pmaddubsw           m15, m10 ; a2h
    paddw               m20, m12
    mova                m12, m16
    pmaddubsw           m16, m10 ; b2l
    paddw               m21, m13
    mova                m13, m17
    pmaddubsw           m17, m10 ; b2h
    paddw               m18, m14
    paddw               m19, m15
    paddw               m20, m16
    paddw               m21, m17
    movu                m17, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    punpcklbw           m14, m0, m17 ; 67l
    punpckhbw           m15, m0, m17 ; 67h
    pmaddubsw           m16, m14, m11 ; a3l
    pmaddubsw           m0, m15, m11 ; a3h
    paddw               m18, m16
    paddw               m19, m0
    movu                m0, [srcq+ssq*0]
    punpcklbw           m16, m17, m0 ; 78l
    punpckhbw           m17, m0 ; 78h
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    packuswb            m18, m19
    mova                [dstq+dsq*0], m18
    pmaddubsw           m18, m16, m11 ; b3l
    pmaddubsw           m19, m17, m11 ; b3h
    paddw               m18, m20
    paddw               m19, m21
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    packuswb            m18, m19
    mova                [dstq+dsq*1], m18
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .v_loop
    ; next 64-pixel strip: restore h from the low byte of r6, drop one strip
    add                 r4, 64
    add                 r7, 64
    movzx               hd, r6b
    mov                 srcq, r4
    mov                 dstq, r7
    sub                 r6d, 256
    jg .v_loop0
    vzeroupper
    RET
; ---- horizontal filtering ----------------------------------------------------
.h:
    test                myd, 0xf00
    jnz .hv                                 ; both fractions set; .hv is defined below this span
.h2:
    vpbroadcastd        m5, [pd_34] ; 2 + (8 << 2)
    cmp                 wd, 4
    jl .h_w2
    vbroadcasti128      m6, [subpel_h_shufA] ; vector load leaves the flags from cmp intact
    je .h_w4
    ; w >= 8: full 8-tap filter (index in bits 16+ of mx), window starts 3
    ; pixels to the left; dispatch on tzcnt(w) through the _8tap_h table
    tzcnt               wd, wd
    vbroadcasti128      m7, [subpel_h_shufB]
    vbroadcasti128      m8, [subpel_h_shufC]
    shr                 mxd, 16
    sub                 srcq, 3
    movzx               wd, word [r8+wq*2+table_offset(put, _8tap_h)]
    vpbroadcastd        m9, [base+mxq*8+subpel_filters+0]
    vpbroadcastd        m10, [base+mxq*8+subpel_filters+4]
    add                 wq, r8
    jmp                 wq
; w == 2: filter index from the low byte of mx; only bytes 2-5 of the 8-byte
; entry are used, so the window starts 1 pixel to the left
.h_w2:
    movzx               mxd, mxb
    dec                 srcq
    mova                xmm4, [subpel_h_shuf4]
    vpbroadcastd        xmm3, [base+mxq*8+subpel_filters+2]
.h_w2_loop:
    movq                xmm0, [srcq+ssq*0]
    movhps              xmm0, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    pshufb              xmm0, xmm4
    mova                xmm1, xm5           ; accumulator starts at the pd_34 bias
    vpdpbusd            xmm1, xmm0, xmm3
    packssdw            xmm0, xmm1, xmm1
    psraw               xmm0, 6
    packuswb            xmm0, xm0
    pextrw              [dstq+dsq*0], xmm0, 0
    pextrw              [dstq+dsq*1], xmm0, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .h_w2_loop
    RET
; w == 4: same 4-byte filter slice as w == 2, one row per register
.h_w4:
    movzx               mxd, mxb
    dec                 srcq
    vpbroadcastd        xmm3, [base+mxq*8+subpel_filters+2]
.h_w4_loop:
    movq                xmm0, [srcq+ssq*0]
    movq                xmm1, [srcq+ssq*1]
    lea                 srcq, [srcq+ssq*2]
    pshufb              xmm0, xm6
    pshufb              xmm1, xm6
    mova                xmm2, xm5
    vpdpbusd            xmm2, xmm0, xmm3
    mova                xmm0, xm5
    vpdpbusd            xmm0, xmm1, xmm3
    packssdw            xmm0, xmm2, xmm0
    psraw               xmm0, 6
    packuswb            xmm0, xmm0
    movd                [dstq+dsq*0], xmm0
    pextrd              [dstq+dsq*1], xmm0, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .h_w4_loop
    RET
; NOTE(review): PUT_8TAP_H is a macro defined outside this span; from its use
; below it filters the rows held in its first register argument and leaves
; 16-bit results there, using the other arguments as temporaries -- confirm.
.h_w8:
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    lea                 srcq, [srcq+ssq*2]
    WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
    vpmovuswb           xm0, ym0
    movq                [dstq+dsq*0], xm0
    movhps              [dstq+dsq*1], xm0
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .h_w8
    RET
; w == 16: replaces the shuffle masks in m6-m8 with byte permutations
.h_w16:
    mova                m6, [spel_h_perm16]
    vpbroadcastd        m8, [pb_4]
    paddb               m7, m8, m6          ; perm16 + 4
    paddb               m8, m7              ; perm16 + 8
.h_w16_loop:
    movu                ym0, [srcq+ssq*0]
    vinserti32x8        m0, [srcq+ssq*1], 1
    lea                 srcq, [srcq+ssq*2]
    PUT_8TAP_H 0, 1, 2, 3, 1
    vpmovuswb           ym0, m0
    mova                [dstq+dsq*0], xm0
    vextracti128        [dstq+dsq*1], ym0, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .h_w16_loop
    RET
; w == 32: two overlapping 32-byte loads per row (offsets 0 and 8), two rows
; per register
.h_w32:
    movu                ym0, [srcq+ssq*0+8*0]
    vinserti32x8        m0, [srcq+ssq*1+8*0], 1
    movu                ym1, [srcq+ssq*0+8*1]
    vinserti32x8        m1, [srcq+ssq*1+8*1], 1
    lea                 srcq, [srcq+ssq*2]
    PUT_8TAP_H 0, 2, 3, 4
    PUT_8TAP_H 1, 4, 3, 2
    packuswb            m0, m1
    mova                [dstq+dsq*0], ym0
    vextracti32x8       [dstq+dsq*1], m0, 1
    lea                 dstq, [dstq+dsq*2]
    sub                 hd, 2
    jg .h_w32
    RET
2498*c0909341SAndroid Build Coastguard Worker.h_w64: 2499*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+8*0] 2500*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*1] 2501*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2502*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 2, 3, 4 2503*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 1, 4, 3, 2 2504*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2505*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 2506*c0909341SAndroid Build Coastguard Worker add dstq, dsq 2507*c0909341SAndroid Build Coastguard Worker dec hd 2508*c0909341SAndroid Build Coastguard Worker jg .h_w64 2509*c0909341SAndroid Build Coastguard Worker RET 2510*c0909341SAndroid Build Coastguard Worker.h_w128: 2511*c0909341SAndroid Build Coastguard Worker movu m0, [srcq+8*0] 2512*c0909341SAndroid Build Coastguard Worker movu m2, [srcq+8*1] 2513*c0909341SAndroid Build Coastguard Worker movu m1, [srcq+8*8] 2514*c0909341SAndroid Build Coastguard Worker movu m3, [srcq+8*9] 2515*c0909341SAndroid Build Coastguard Worker add srcq, ssq 2516*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 0, 4, 11, 12 2517*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 2, 12, 11, 4 2518*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 1, 4, 11, 12 2519*c0909341SAndroid Build Coastguard Worker PUT_8TAP_H 3, 12, 11, 4 2520*c0909341SAndroid Build Coastguard Worker packuswb m0, m2 2521*c0909341SAndroid Build Coastguard Worker packuswb m1, m3 2522*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 2523*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1 2524*c0909341SAndroid Build Coastguard Worker add dstq, dsq 2525*c0909341SAndroid Build Coastguard Worker dec hd 2526*c0909341SAndroid Build Coastguard Worker jg .h_w128 2527*c0909341SAndroid Build Coastguard Worker RET 2528*c0909341SAndroid Build Coastguard Worker.hv: 2529*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_34] 
2530*c0909341SAndroid Build Coastguard Worker pxor xm0, xm0 2531*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2532*c0909341SAndroid Build Coastguard Worker jg .hv_w8 2533*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb 2534*c0909341SAndroid Build Coastguard Worker dec srcq 2535*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+subpel_filters+mxq*8+2] 2536*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2537*c0909341SAndroid Build Coastguard Worker shr myd, 16 2538*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2539*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2540*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym1, [base+subpel_filters+myq*8] 2541*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2542*c0909341SAndroid Build Coastguard Worker mov r6, srcq 2543*c0909341SAndroid Build Coastguard Worker punpcklbw ym0, ym1 2544*c0909341SAndroid Build Coastguard Worker sub r6, ss3q 2545*c0909341SAndroid Build Coastguard Worker psraw ym0, 2 ; << 6 2546*c0909341SAndroid Build Coastguard Worker mova xm14, [spel_hv_end] 2547*c0909341SAndroid Build Coastguard Worker pshufd ym10, ym0, q0000 2548*c0909341SAndroid Build Coastguard Worker pshufd ym11, ym0, q1111 2549*c0909341SAndroid Build Coastguard Worker pshufd ym12, ym0, q2222 2550*c0909341SAndroid Build Coastguard Worker pshufd ym13, ym0, q3333 2551*c0909341SAndroid Build Coastguard Worker cmp wd, 4 2552*c0909341SAndroid Build Coastguard Worker je .hv_w4 2553*c0909341SAndroid Build Coastguard Worker vbroadcasti128 ym6, [subpel_h_shuf4] 2554*c0909341SAndroid Build Coastguard Worker movq xmm2, [r6+ssq*0] 2555*c0909341SAndroid Build Coastguard Worker movhps xmm2, [r6+ssq*1] 2556*c0909341SAndroid Build Coastguard Worker movq xmm0, [r6+ssq*2] 2557*c0909341SAndroid Build Coastguard Worker movhps xmm0, [srcq+ssq*0] 2558*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [srcq+ssq*1] 2559*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm4, 
[srcq+ssq*2] 2560*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2561*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm1, [srcq+ssq*0] 2562*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm3, 0x30 2563*c0909341SAndroid Build Coastguard Worker vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ 2564*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 2565*c0909341SAndroid Build Coastguard Worker pshufb ymm2, ym6 2566*c0909341SAndroid Build Coastguard Worker pshufb ymm0, ym6 2567*c0909341SAndroid Build Coastguard Worker mova ymm1, ym9 2568*c0909341SAndroid Build Coastguard Worker vpdpbusd ymm1, ymm2, ym7 2569*c0909341SAndroid Build Coastguard Worker mova ymm2, ym9 2570*c0909341SAndroid Build Coastguard Worker vpdpbusd ymm2, ymm0, ym7 2571*c0909341SAndroid Build Coastguard Worker packssdw ymm2, ymm1, ymm2 2572*c0909341SAndroid Build Coastguard Worker psraw ymm2, 2 2573*c0909341SAndroid Build Coastguard Worker vextracti128 xmm3, ymm2, 1 2574*c0909341SAndroid Build Coastguard Worker palignr xmm4, xmm3, xmm2, 4 2575*c0909341SAndroid Build Coastguard Worker punpcklwd xmm1, xmm2, xmm4 ; 01 12 2576*c0909341SAndroid Build Coastguard Worker punpckhwd xmm2, xmm4 ; 23 34 2577*c0909341SAndroid Build Coastguard Worker pshufd xmm0, xmm3, q2121 2578*c0909341SAndroid Build Coastguard Worker punpcklwd xmm3, xmm0 ; 45 56 2579*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: 2580*c0909341SAndroid Build Coastguard Worker movq xmm4, [srcq+ssq*1] 2581*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2582*c0909341SAndroid Build Coastguard Worker movhps xmm4, [srcq+ssq*0] 2583*c0909341SAndroid Build Coastguard Worker pmaddwd xmm5, xmm1, xm10 ; a0 b0 2584*c0909341SAndroid Build Coastguard Worker mova xmm1, xmm2 2585*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xmm2, xm11 ; a1 b1 2586*c0909341SAndroid Build Coastguard Worker pshufb xmm4, xm6 2587*c0909341SAndroid Build Coastguard Worker mova xmm2, xmm3 2588*c0909341SAndroid 
Build Coastguard Worker vpdpwssd xmm5, xmm3, xm12 ; a2 b2 2589*c0909341SAndroid Build Coastguard Worker mova xmm3, xm9 2590*c0909341SAndroid Build Coastguard Worker vpdpbusd xmm3, xmm4, xm7 2591*c0909341SAndroid Build Coastguard Worker packssdw xmm4, xmm3, xmm3 2592*c0909341SAndroid Build Coastguard Worker psraw xmm4, 2 2593*c0909341SAndroid Build Coastguard Worker palignr xmm3, xmm4, xmm0, 12 2594*c0909341SAndroid Build Coastguard Worker mova xmm0, xmm4 2595*c0909341SAndroid Build Coastguard Worker punpcklwd xmm3, xmm4 ; 67 78 2596*c0909341SAndroid Build Coastguard Worker vpdpwssd xmm5, xmm3, xm13 ; a3 b3 2597*c0909341SAndroid Build Coastguard Worker packuswb xmm5, xmm5 2598*c0909341SAndroid Build Coastguard Worker pshufb xmm5, xm14 2599*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm5, 0 2600*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm5, 1 2601*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2602*c0909341SAndroid Build Coastguard Worker sub hd, 2 2603*c0909341SAndroid Build Coastguard Worker jg .hv_w2_loop 2604*c0909341SAndroid Build Coastguard Worker vzeroupper 2605*c0909341SAndroid Build Coastguard Worker RET 2606*c0909341SAndroid Build Coastguard Worker.hv_w4: 2607*c0909341SAndroid Build Coastguard Worker movq xmm1, [r6+ssq*0] 2608*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym2, [r6+ssq*1] 2609*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 2610*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+ssq*0], 2 2611*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+ssq*1], 2 2612*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 2613*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [subpel_h_shufA] 2614*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2615*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 2616*c0909341SAndroid Build Coastguard Worker pshufb 
m2, m6 2617*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 2618*c0909341SAndroid Build Coastguard Worker mova m0, m9 2619*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m2, m7 2620*c0909341SAndroid Build Coastguard Worker mova m4, m9 2621*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m1, m7 2622*c0909341SAndroid Build Coastguard Worker mova ym1, [spel_hv_perm4a] 2623*c0909341SAndroid Build Coastguard Worker mova ym2, [spel_hv_perm4b] 2624*c0909341SAndroid Build Coastguard Worker mova ym3, [spel_hv_perm4c] 2625*c0909341SAndroid Build Coastguard Worker packssdw m0, m4 2626*c0909341SAndroid Build Coastguard Worker psraw m0, 2 ; _ 0 1 2 3 4 5 6 2627*c0909341SAndroid Build Coastguard Worker mov r6d, 0x5555 2628*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym1, ym0 ; 01 12 2629*c0909341SAndroid Build Coastguard Worker vpermb m2, m2, m0 ; 23 34 2630*c0909341SAndroid Build Coastguard Worker vpermb m3, m3, m0 ; 45 56 2631*c0909341SAndroid Build Coastguard Worker kmovw k1, r6d 2632*c0909341SAndroid Build Coastguard Worker mova ym15, [spel_hv_perm4d] 2633*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 2634*c0909341SAndroid Build Coastguard Worker movq xmm4, [srcq+ssq*1] 2635*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2636*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 2637*c0909341SAndroid Build Coastguard Worker pmaddwd ym5, ym1, ym10 ; a0 b0 2638*c0909341SAndroid Build Coastguard Worker mova ym1, ym2 2639*c0909341SAndroid Build Coastguard Worker pshufb ym4, ym6 2640*c0909341SAndroid Build Coastguard Worker mova ym0, ym9 2641*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym4, ym7 2642*c0909341SAndroid Build Coastguard Worker vpdpwssd ym5, ym2, ym11 ; a1 b1 2643*c0909341SAndroid Build Coastguard Worker mova ym2, ym3 2644*c0909341SAndroid Build Coastguard Worker vpdpwssd ym5, ym3, ym12 ; a2 b2 2645*c0909341SAndroid Build Coastguard Worker vpsraw ym3{k1}, ym0, 2 ; 7 8 
2646*c0909341SAndroid Build Coastguard Worker vpermb ym3, ym15, ym3 ; 67 78 2647*c0909341SAndroid Build Coastguard Worker vpdpwssd ym5, ym3, ym13 ; a3 b3 2648*c0909341SAndroid Build Coastguard Worker packuswb ym5, ym5 2649*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym14, ym5 2650*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xm5 2651*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xm5, 1 2652*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2653*c0909341SAndroid Build Coastguard Worker sub hd, 2 2654*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 2655*c0909341SAndroid Build Coastguard Worker RET 2656*c0909341SAndroid Build Coastguard Worker.hv_w8: 2657*c0909341SAndroid Build Coastguard Worker shr mxd, 16 2658*c0909341SAndroid Build Coastguard Worker sub srcq, 3 2659*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+subpel_filters+mxq*8+0] 2660*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+subpel_filters+mxq*8+4] 2661*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2662*c0909341SAndroid Build Coastguard Worker shr myd, 16 2663*c0909341SAndroid Build Coastguard Worker cmp hd, 6 2664*c0909341SAndroid Build Coastguard Worker cmovs myd, mxd 2665*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [base+subpel_filters+myq*8] 2666*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 2667*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2668*c0909341SAndroid Build Coastguard Worker psraw m0, 2 ; << 6 2669*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q0000 2670*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q1111 2671*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q2222 2672*c0909341SAndroid Build Coastguard Worker pshufd m15, m0, q3333 2673*c0909341SAndroid Build Coastguard Worker cmp wd, 8 2674*c0909341SAndroid Build Coastguard Worker jne .hv_w16 2675*c0909341SAndroid Build Coastguard Worker mov r6, srcq 
2676*c0909341SAndroid Build Coastguard Worker sub r6, ss3q 2677*c0909341SAndroid Build Coastguard Worker movu xmm1, [r6+ssq*0] 2678*c0909341SAndroid Build Coastguard Worker vinserti128 ymm1, [r6+ssq*1], 1 2679*c0909341SAndroid Build Coastguard Worker movu xmm2, [srcq+ssq*1] 2680*c0909341SAndroid Build Coastguard Worker vinserti32x4 m6, zmm1, [r6+ssq*2], 2 2681*c0909341SAndroid Build Coastguard Worker vinserti128 ymm2, [srcq+ssq*2], 1 2682*c0909341SAndroid Build Coastguard Worker vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 2683*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2684*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [subpel_h_shufA] 2685*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ 2686*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [subpel_h_shufB] 2687*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [subpel_h_shufC] 2688*c0909341SAndroid Build Coastguard Worker pshufb m1, m6, m4 ; 0 1 2 3 0123 2689*c0909341SAndroid Build Coastguard Worker mova m2, m9 2690*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m1, m10 2691*c0909341SAndroid Build Coastguard Worker pshufb m5, m6, m7 ; 0 1 2 3 4567 2692*c0909341SAndroid Build Coastguard Worker mova m1, m9 2693*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m5, m10 2694*c0909341SAndroid Build Coastguard Worker pshufb m4, m0, m4 ; 4 5 6 _ 0123 2695*c0909341SAndroid Build Coastguard Worker mova m3, m9 2696*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m10 2697*c0909341SAndroid Build Coastguard Worker pshufb m7, m0, m7 ; 4 5 6 _ 4567 2698*c0909341SAndroid Build Coastguard Worker mova m4, m9 2699*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m7, m10 2700*c0909341SAndroid Build Coastguard Worker pshufb m6, m8 2701*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m5, m11 2702*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m6, m11 2703*c0909341SAndroid Build Coastguard Worker pshufb m6, 
m0, m8 2704*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m7, m11 2705*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m6, m11 2706*c0909341SAndroid Build Coastguard Worker mova m5, [spel_hv_perm8a] 2707*c0909341SAndroid Build Coastguard Worker vpaddd m0, m5, [pb_32] {1to16} 2708*c0909341SAndroid Build Coastguard Worker mov r6, 0x55555555ff00 2709*c0909341SAndroid Build Coastguard Worker packssdw m2, m1 2710*c0909341SAndroid Build Coastguard Worker packssdw m3, m4 2711*c0909341SAndroid Build Coastguard Worker mova m8, [spel_hv_perm8b] 2712*c0909341SAndroid Build Coastguard Worker psraw m2, 2 ; 0 1 2 3 2713*c0909341SAndroid Build Coastguard Worker psraw m3, 2 ; 4 5 6 _ 2714*c0909341SAndroid Build Coastguard Worker vpermb m1, m5, m2 ; 01 12 2715*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [subpel_h_shufA] 2716*c0909341SAndroid Build Coastguard Worker kmovq k1, r6 2717*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m0, m3 ; 23 34 2718*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m7, [subpel_h_shufB] 2719*c0909341SAndroid Build Coastguard Worker kshiftrq k2, k1, 16 2720*c0909341SAndroid Build Coastguard Worker mova xm16, [spel_hv_end] 2721*c0909341SAndroid Build Coastguard Worker vpermb m3, m5, m3 ; 45 56 2722*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 2723*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [srcq+ssq*1] 2724*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 2725*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4{k1}, [srcq+ssq*0] 2726*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m1, m12 ; a0 b0 2727*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m6 ; 7 8 0123 4567 2728*c0909341SAndroid Build Coastguard Worker mova m5, m9 2729*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m1, m10 2730*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 ; 7 8 4567 89ab 2731*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m2, m13 ; a1 b1 
2732*c0909341SAndroid Build Coastguard Worker mova m1, m2 2733*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m4, m11 2734*c0909341SAndroid Build Coastguard Worker mova m2, m3 2735*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m3, m14 ; a2 b2 2736*c0909341SAndroid Build Coastguard Worker psraw m3{k2}, m5, 2 ; 75 86 2737*c0909341SAndroid Build Coastguard Worker vpermb m3, m8, m3 ; 67 78 2738*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m3, m15 ; a3 b3 2739*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 2740*c0909341SAndroid Build Coastguard Worker vpermb zmm1, m16, m0 2741*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm1 2742*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm1 2743*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 2744*c0909341SAndroid Build Coastguard Worker sub hd, 2 2745*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 2746*c0909341SAndroid Build Coastguard Worker vzeroupper 2747*c0909341SAndroid Build Coastguard Worker RET 2748*c0909341SAndroid Build Coastguard Worker.hv_w16: 2749*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 23 2750*c0909341SAndroid Build Coastguard Worker movu m22, [spel_hv_perm16a] 2751*c0909341SAndroid Build Coastguard Worker sub srcq, ss3q 2752*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pb_4] 2753*c0909341SAndroid Build Coastguard Worker lea r6d, [wq*2-32] 2754*c0909341SAndroid Build Coastguard Worker mova m7, [spel_hv_perm16b] 2755*c0909341SAndroid Build Coastguard Worker paddb m20, m8, m22 2756*c0909341SAndroid Build Coastguard Worker mova ym16, [spel_hv_end16] 2757*c0909341SAndroid Build Coastguard Worker paddb m21, m8, m20 2758*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+r6*8] 2759*c0909341SAndroid Build Coastguard Worker paddb m8, m7 2760*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 2761*c0909341SAndroid Build Coastguard Worker movu ym17, [srcq+ssq*0] 2762*c0909341SAndroid Build 
Coastguard Worker vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 2763*c0909341SAndroid Build Coastguard Worker lea r4, [srcq+ss3q] 2764*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*2] 2765*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 2766*c0909341SAndroid Build Coastguard Worker mov r7, dstq 2767*c0909341SAndroid Build Coastguard Worker movu ym19, [r4 +ssq*1] 2768*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 2769*c0909341SAndroid Build Coastguard Worker add r4, ss3q 2770*c0909341SAndroid Build Coastguard Worker vpermb m2, m22, m17 ; 0 1 0123 89ab 2771*c0909341SAndroid Build Coastguard Worker mova m1, m9 2772*c0909341SAndroid Build Coastguard Worker vpermb m3, m21, m17 ; 0 1 89ab ghij 2773*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m2, m10 2774*c0909341SAndroid Build Coastguard Worker mova m2, m9 2775*c0909341SAndroid Build Coastguard Worker vpermb m4, m22, m18 ; 2 3 0123 89ab 2776*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m3, m11 2777*c0909341SAndroid Build Coastguard Worker mova m3, m9 2778*c0909341SAndroid Build Coastguard Worker vpermb m5, m21, m18 ; 2 3 89ab ghij 2779*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m10 2780*c0909341SAndroid Build Coastguard Worker mova m4, m9 2781*c0909341SAndroid Build Coastguard Worker vpermb m6, m22, m19 ; 4 5 0123 89ab 2782*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m5, m11 2783*c0909341SAndroid Build Coastguard Worker mova m5, m9 2784*c0909341SAndroid Build Coastguard Worker vpermb m17, m20, m17 ; 0 1 4567 cdef 2785*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m6, m10 2786*c0909341SAndroid Build Coastguard Worker mova m6, m9 2787*c0909341SAndroid Build Coastguard Worker vpermb m0, m21, m19 ; 4 5 89ab ghij 2788*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m17, m11 2789*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m17, m10 2790*c0909341SAndroid Build Coastguard Worker movu 
ym17, [r4+ssq*0] ; 6 2791*c0909341SAndroid Build Coastguard Worker vpermb m18, m20, m18 ; 2 3 4567 cdef 2792*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m0, m11 2793*c0909341SAndroid Build Coastguard Worker vpermb m0, m7, m17 ; 6 0145 2367 89cd abef 2794*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m18, m11 2795*c0909341SAndroid Build Coastguard Worker vpermb m19, m20, m19 ; 4 5 4567 cdef 2796*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m18, m10 2797*c0909341SAndroid Build Coastguard Worker mova m18, m9 2798*c0909341SAndroid Build Coastguard Worker vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij 2799*c0909341SAndroid Build Coastguard Worker vpdpbusd m18, m0, m10 2800*c0909341SAndroid Build Coastguard Worker packssdw m1, m2 2801*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m19, m11 2802*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m19, m10 2803*c0909341SAndroid Build Coastguard Worker packssdw m3, m4 2804*c0909341SAndroid Build Coastguard Worker vpdpbusd m18, m17, m11 2805*c0909341SAndroid Build Coastguard Worker psraw m1, 2 ; 01 2806*c0909341SAndroid Build Coastguard Worker psraw m3, 2 ; 23 2807*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 2808*c0909341SAndroid Build Coastguard Worker vpshrdd m2, m1, m3, 16 ; 12 2809*c0909341SAndroid Build Coastguard Worker psraw m5, 2 ; 45 2810*c0909341SAndroid Build Coastguard Worker vpshrdd m4, m3, m5, 16 ; 34 2811*c0909341SAndroid Build Coastguard Worker psraw m18, 2 2812*c0909341SAndroid Build Coastguard Worker vpshrdd m6, m5, m18, 16 ; 56 2813*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 2814*c0909341SAndroid Build Coastguard Worker movu ym19, [r4+ssq*1] 2815*c0909341SAndroid Build Coastguard Worker lea r4, [r4+ssq*2] 2816*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [r4+ssq*0], 1 2817*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m1, m12 ; a0 2818*c0909341SAndroid Build Coastguard Worker vpermb m1, m22, m19 ; 7 8 0123 89ab 
2819*c0909341SAndroid Build Coastguard Worker pmaddwd m18, m2, m12 ; b0 2820*c0909341SAndroid Build Coastguard Worker mova m0, m9 2821*c0909341SAndroid Build Coastguard Worker vpermb m2, m21, m19 ; 7 8 89ab ghij 2822*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m1, m10 2823*c0909341SAndroid Build Coastguard Worker mova m1, m9 2824*c0909341SAndroid Build Coastguard Worker vpermb m19, m20, m19 ; 7 8 4567 cdef 2825*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m2, m11 2826*c0909341SAndroid Build Coastguard Worker mova m2, m4 2827*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m3, m13 ; a1 2828*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m4, m13 ; b1 2829*c0909341SAndroid Build Coastguard Worker mova m4, m6 2830*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m19, m11 2831*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m19, m10 2832*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m5, m14 ; a2 2833*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m6, m14 ; b2 2834*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 2835*c0909341SAndroid Build Coastguard Worker mova m1, m3 2836*c0909341SAndroid Build Coastguard Worker psraw m6, m0, 2 ; 78 2837*c0909341SAndroid Build Coastguard Worker mova m3, m5 2838*c0909341SAndroid Build Coastguard Worker vpshrdd m5, m4, m6, 16 ; 67 2839*c0909341SAndroid Build Coastguard Worker vpdpwssd m18, m6, m15 ; b3 2840*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m5, m15 ; a3 2841*c0909341SAndroid Build Coastguard Worker packuswb m17, m18 2842*c0909341SAndroid Build Coastguard Worker vpermb m17, m16, m17 2843*c0909341SAndroid Build Coastguard Worker mova [r7+dsq*0], xm17 2844*c0909341SAndroid Build Coastguard Worker vextracti128 [r7+dsq*1], ym17, 1 2845*c0909341SAndroid Build Coastguard Worker lea r7, [r7+dsq*2] 2846*c0909341SAndroid Build Coastguard Worker sub hd, 2 2847*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 2848*c0909341SAndroid Build Coastguard 
Worker add srcq, 16 2849*c0909341SAndroid Build Coastguard Worker add dstq, 16 2850*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 2851*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 2852*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop0 2853*c0909341SAndroid Build Coastguard Worker RET 2854*c0909341SAndroid Build Coastguard Worker 2855*c0909341SAndroid Build Coastguard Worker%if WIN64 2856*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4 2857*c0909341SAndroid Build Coastguard Worker%else 2858*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7 2859*c0909341SAndroid Build Coastguard Worker%endif 2860*c0909341SAndroid Build Coastguard Worker 2861*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap, 2862*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc 2863*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc 2864*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc 2865*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc 2866*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc 2867*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular, REGULAR, REGULAR 2868*c0909341SAndroid Build Coastguard Worker 2869*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3 2870*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl 2871*c0909341SAndroid Build Coastguard Worker imul mxd, mxm, 0x010101 2872*c0909341SAndroid Build Coastguard Worker add mxd, t0d ; 6tap_h, mx, 4tap_h 2873*c0909341SAndroid Build Coastguard Worker imul myd, mym, 0x010101 2874*c0909341SAndroid Build Coastguard Worker add myd, t1d ; 6tap_v, my, 4tap_v 2875*c0909341SAndroid Build Coastguard Worker lea r7, [prep_avx512icl] 2876*c0909341SAndroid 
Build Coastguard Worker movifnidn hd, hm 2877*c0909341SAndroid Build Coastguard Worker test mxd, 0xf00 2878*c0909341SAndroid Build Coastguard Worker jnz .h 2879*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 2880*c0909341SAndroid Build Coastguard Worker jnz .v 2881*c0909341SAndroid Build Coastguard Worker.prep: 2882*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 2883*c0909341SAndroid Build Coastguard Worker movzx wd, word [r7+wq*2+table_offset(prep,)] 2884*c0909341SAndroid Build Coastguard Worker add wq, r7 2885*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*3] 2886*c0909341SAndroid Build Coastguard Worker%if WIN64 2887*c0909341SAndroid Build Coastguard Worker pop r7 2888*c0909341SAndroid Build Coastguard Worker%endif 2889*c0909341SAndroid Build Coastguard Worker jmp wq 2890*c0909341SAndroid Build Coastguard Worker.v: 2891*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 2892*c0909341SAndroid Build Coastguard Worker shr myd, 16 2893*c0909341SAndroid Build Coastguard Worker cmp hd, 4 2894*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 2895*c0909341SAndroid Build Coastguard Worker tzcnt r5d, wd 2896*c0909341SAndroid Build Coastguard Worker lea myq, [base+subpel_filters+1+myq*8] 2897*c0909341SAndroid Build Coastguard Worker movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] 2898*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [pw_8192] 2899*c0909341SAndroid Build Coastguard Worker sub srcq, ssq 2900*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, [myq+0] 2901*c0909341SAndroid Build Coastguard Worker add r5, r7 2902*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [myq+2] 2903*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 2904*c0909341SAndroid Build Coastguard Worker vpbroadcastw m10, [myq+4] 2905*c0909341SAndroid Build Coastguard Worker sub srcq, ssq 2906*c0909341SAndroid Build Coastguard Worker jmp r5 2907*c0909341SAndroid Build Coastguard Worker.v_w4: 2908*c0909341SAndroid Build 
Coastguard Worker movd xmm2, [srcq+ssq*0] 2909*c0909341SAndroid Build Coastguard Worker pinsrd xmm2, [srcq+ssq*1], 1 2910*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm1, [srcq+ssq*2] 2911*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2912*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm3, [srcq+ssq*0] 2913*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm0, [srcq+ssq*1] 2914*c0909341SAndroid Build Coastguard Worker vbroadcasti128 ymm5, [deint_shuf4] 2915*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm2, 0xeb 2916*c0909341SAndroid Build Coastguard Worker punpcklqdq ymm3, ymm0 2917*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _ 2918*c0909341SAndroid Build Coastguard Worker pshufb ymm1, ymm5 ; 01 12 23 34 2919*c0909341SAndroid Build Coastguard Worker.v_w4_loop: 2920*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [srcq+ssq*2], 1 2921*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm2, [srcq+ss3q ] 2922*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2923*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm3, [srcq+ssq*0] 2924*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm0, 0xeb 2925*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm0, [srcq+ssq*1] 2926*c0909341SAndroid Build Coastguard Worker punpcklqdq ymm3, ymm0 2927*c0909341SAndroid Build Coastguard Worker vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _ 2928*c0909341SAndroid Build Coastguard Worker pshufb ymm2, ymm5 ; 45 56 67 78 2929*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0 2930*c0909341SAndroid Build Coastguard Worker vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56 2931*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2 2932*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm1, ym9 ; a1 b1 c1 d1 2933*c0909341SAndroid Build Coastguard Worker paddw ymm3, ymm4 2934*c0909341SAndroid Build Coastguard Worker paddw 
ymm3, ymm1 2935*c0909341SAndroid Build Coastguard Worker pmulhrsw ymm3, ym7 2936*c0909341SAndroid Build Coastguard Worker mova ymm1, ymm2 2937*c0909341SAndroid Build Coastguard Worker mova [tmpq], ymm3 2938*c0909341SAndroid Build Coastguard Worker add tmpq, 32 2939*c0909341SAndroid Build Coastguard Worker sub hd, 4 2940*c0909341SAndroid Build Coastguard Worker jg .v_w4_loop 2941*c0909341SAndroid Build Coastguard Worker vzeroupper 2942*c0909341SAndroid Build Coastguard Worker RET 2943*c0909341SAndroid Build Coastguard Worker.v_w8: 2944*c0909341SAndroid Build Coastguard Worker mova m6, [spel_v_perm8] 2945*c0909341SAndroid Build Coastguard Worker movq xm1, [srcq+ssq*0] 2946*c0909341SAndroid Build Coastguard Worker mov r6d, 0x3e 2947*c0909341SAndroid Build Coastguard Worker movq xm2, [srcq+ssq*1] 2948*c0909341SAndroid Build Coastguard Worker kmovb k1, r6d 2949*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym3, [srcq+ssq*2] 2950*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2951*c0909341SAndroid Build Coastguard Worker vpunpcklqdq ym2, [srcq+ssq*0] {1to4} 2952*c0909341SAndroid Build Coastguard Worker vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} 2953*c0909341SAndroid Build Coastguard Worker movq xm0, [srcq+ssq*1] 2954*c0909341SAndroid Build Coastguard Worker kshiftlb k2, k1, 2 2955*c0909341SAndroid Build Coastguard Worker shufpd m1, m2, 0x18 ; 0 1 2 3 4 2956*c0909341SAndroid Build Coastguard Worker vpermb m1, m6, m1 ; 01 12 23 34 2957*c0909341SAndroid Build Coastguard Worker.v_w8_loop: 2958*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym3, [srcq+ss3q ] 2959*c0909341SAndroid Build Coastguard Worker vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} 2960*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2961*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [srcq+ssq*1] 2962*c0909341SAndroid Build Coastguard Worker vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} 2963*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m1, m8 ; a0 
b0 c0 d0 2964*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m0 ; 45 56 67 78 2965*c0909341SAndroid Build Coastguard Worker mova xm0, xm3 2966*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m2, q1032 ; 23 34 45 56 2967*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2, m10 ; a2 b2 c2 d2 2968*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m1, m9 ; a1 b1 c1 d1 2969*c0909341SAndroid Build Coastguard Worker mova m1, m2 2970*c0909341SAndroid Build Coastguard Worker paddw m4, m3 2971*c0909341SAndroid Build Coastguard Worker paddw m4, m5 2972*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 2973*c0909341SAndroid Build Coastguard Worker mova [tmpq], m4 2974*c0909341SAndroid Build Coastguard Worker add tmpq, 64 2975*c0909341SAndroid Build Coastguard Worker sub hd, 4 2976*c0909341SAndroid Build Coastguard Worker jg .v_w8_loop 2977*c0909341SAndroid Build Coastguard Worker RET 2978*c0909341SAndroid Build Coastguard Worker.v_w16: 2979*c0909341SAndroid Build Coastguard Worker mova m11, [spel_v_perm16b] 2980*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [srcq+ssq*0] 2981*c0909341SAndroid Build Coastguard Worker mov r6d, 0x0f 2982*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [srcq+ssq*1] 2983*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ssq*2] 2984*c0909341SAndroid Build Coastguard Worker kmovb k1, r6d 2985*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 2986*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [srcq+ssq*0] 2987*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+ssq*1] 2988*c0909341SAndroid Build Coastguard Worker vshufpd m1{k1}, m3, m2, 0xcc 2989*c0909341SAndroid Build Coastguard Worker vshufpd m2{k1}, m4, m0, 0xcc 2990*c0909341SAndroid Build Coastguard Worker vpermb m1, m11, m1 ; 01 12 2991*c0909341SAndroid Build Coastguard Worker vpermb m2, m11, m2 ; 23 34 2992*c0909341SAndroid Build Coastguard Worker.v_w16_loop: 2993*c0909341SAndroid
Build Coastguard Worker pmaddubsw m3, m1, m8 ; a0 b0 2994*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m2, m9 ; a1 b1 2995*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym6, [srcq+ssq*2] 2996*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m8 ; c0 d0 2997*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [srcq+ss3q ] 2998*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 2999*c0909341SAndroid Build Coastguard Worker vshufpd m0{k1}, m6, m2, 0xcc 3000*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym6, [srcq+ssq*0] 3001*c0909341SAndroid Build Coastguard Worker vpermb m1, m11, m0 ; 45 56 3002*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [srcq+ssq*1] 3003*c0909341SAndroid Build Coastguard Worker vshufpd m2{k1}, m6, m0, 0xcc 3004*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m1, m9 ; c1 d1 3005*c0909341SAndroid Build Coastguard Worker vpermb m2, m11, m2 ; 67 78 3006*c0909341SAndroid Build Coastguard Worker paddw m3, m5 3007*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m1, m10 ; a2 b2 3008*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3009*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m2, m10 ; c2 d2 3010*c0909341SAndroid Build Coastguard Worker paddw m3, m5 3011*c0909341SAndroid Build Coastguard Worker paddw m4, m6 3012*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m7 3013*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 3014*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], m3 3015*c0909341SAndroid Build Coastguard Worker mova [tmpq+64], m4 3016*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 3017*c0909341SAndroid Build Coastguard Worker sub hd, 4 3018*c0909341SAndroid Build Coastguard Worker jg .v_w16_loop 3019*c0909341SAndroid Build Coastguard Worker RET 3020*c0909341SAndroid Build Coastguard Worker.v_w32: 3021*c0909341SAndroid Build Coastguard Worker movshdup m6, [bilin_v_perm64] 3022*c0909341SAndroid Build Coastguard 
Worker movu ym16, [srcq+ssq*0] 3023*c0909341SAndroid Build Coastguard Worker movu ym17, [srcq+ssq*1] 3024*c0909341SAndroid Build Coastguard Worker movu ym18, [srcq+ssq*2] 3025*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 3026*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*0] 3027*c0909341SAndroid Build Coastguard Worker add srcq, ssq 3028*c0909341SAndroid Build Coastguard Worker movu ym20, [srcq+ssq*0] 3029*c0909341SAndroid Build Coastguard Worker vpermt2q m16, m6, m18 ; 0 2 3030*c0909341SAndroid Build Coastguard Worker vpermt2q m17, m6, m19 ; 1 3 3031*c0909341SAndroid Build Coastguard Worker vpermt2q m18, m6, m20 ; 2 4 3032*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m16, m17 ; 01 3033*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m17, m18 ; 12 3034*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m16, m17 ; 23 3035*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m17, m18 ; 34 3036*c0909341SAndroid Build Coastguard Worker.v_w32_loop: 3037*c0909341SAndroid Build Coastguard Worker movu ym16, [srcq+ssq*1] 3038*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*2] 3039*c0909341SAndroid Build Coastguard Worker movu ym17, [srcq+ssq*0] 3040*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m0, m8 ; a0 3041*c0909341SAndroid Build Coastguard Worker mova m0, m2 3042*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m9 ; a1 3043*c0909341SAndroid Build Coastguard Worker vpermt2q m16, m6, m17 ; 5 6 3044*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m1, m8 ; b0 3045*c0909341SAndroid Build Coastguard Worker mova m1, m3 3046*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m9 ; b1 3047*c0909341SAndroid Build Coastguard Worker shufpd m18, m16, 0x55 ; 4 5 3048*c0909341SAndroid Build Coastguard Worker paddw m4, m2 3049*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m18, m16 ; 45 3050*c0909341SAndroid Build Coastguard Worker paddw m5, m3 3051*c0909341SAndroid Build Coastguard Worker 
punpckhbw m3, m18, m16 ; 56 3052*c0909341SAndroid Build Coastguard Worker mova m18, m16 3053*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m2, m10 ; a2 3054*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m3, m10 ; b2 3055*c0909341SAndroid Build Coastguard Worker paddw m4, m16 3056*c0909341SAndroid Build Coastguard Worker paddw m5, m17 3057*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 3058*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 3059*c0909341SAndroid Build Coastguard Worker mova [tmpq+ 0], m4 3060*c0909341SAndroid Build Coastguard Worker mova [tmpq+64], m5 3061*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 3062*c0909341SAndroid Build Coastguard Worker sub hd, 2 3063*c0909341SAndroid Build Coastguard Worker jg .v_w32_loop 3064*c0909341SAndroid Build Coastguard Worker vzeroupper 3065*c0909341SAndroid Build Coastguard Worker RET 3066*c0909341SAndroid Build Coastguard Worker.v_w64: 3067*c0909341SAndroid Build Coastguard Worker.v_w128: 3068*c0909341SAndroid Build Coastguard Worker mova m6, [bilin_v_perm64] 3069*c0909341SAndroid Build Coastguard Worker add wd, wd 3070*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+wq] 3071*c0909341SAndroid Build Coastguard Worker.v_loop0: 3072*c0909341SAndroid Build Coastguard Worker vpermq m12, m6, [srcq+ssq*0] 3073*c0909341SAndroid Build Coastguard Worker vpermq m13, m6, [srcq+ssq*1] 3074*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+ssq*2] 3075*c0909341SAndroid Build Coastguard Worker vpermq m14, m6, [r5 +ssq*0] 3076*c0909341SAndroid Build Coastguard Worker vpermq m15, m6, [r5 +ssq*1] 3077*c0909341SAndroid Build Coastguard Worker lea r5, [r5+ssq*2] 3078*c0909341SAndroid Build Coastguard Worker vpermq m16, m6, [r5 +ssq*0] 3079*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3080*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m12, m13 ; 01 3081*c0909341SAndroid Build Coastguard Worker punpckhbw m12, m13 3082*c0909341SAndroid Build Coastguard Worker 
punpcklbw m1, m13, m14 ; 12 3083*c0909341SAndroid Build Coastguard Worker punpckhbw m13, m14 3084*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m14, m15 ; 23 3085*c0909341SAndroid Build Coastguard Worker punpckhbw m14, m15 3086*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m15, m16 ; 34 3087*c0909341SAndroid Build Coastguard Worker punpckhbw m15, m16 3088*c0909341SAndroid Build Coastguard Worker.v_loop: 3089*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m0, m8 ; a0 3090*c0909341SAndroid Build Coastguard Worker vpermq m5, m6, [r5+ssq*1] 3091*c0909341SAndroid Build Coastguard Worker pmaddubsw m18, m12, m8 3092*c0909341SAndroid Build Coastguard Worker mova m0, m2 3093*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m9 ; a1 3094*c0909341SAndroid Build Coastguard Worker mova m12, m14 3095*c0909341SAndroid Build Coastguard Worker pmaddubsw m14, m9 3096*c0909341SAndroid Build Coastguard Worker lea r5, [r5+ssq*2] 3097*c0909341SAndroid Build Coastguard Worker pmaddubsw m19, m1, m8 ; b0 3098*c0909341SAndroid Build Coastguard Worker pmaddubsw m20, m13, m8 3099*c0909341SAndroid Build Coastguard Worker mova m1, m3 3100*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m9 ; b1 3101*c0909341SAndroid Build Coastguard Worker mova m13, m15 3102*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m9 3103*c0909341SAndroid Build Coastguard Worker paddw m17, m2 3104*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m16, m5 ; 45 3105*c0909341SAndroid Build Coastguard Worker paddw m18, m14 3106*c0909341SAndroid Build Coastguard Worker punpckhbw m14, m16, m5 3107*c0909341SAndroid Build Coastguard Worker vpermq m16, m6, [r5+ssq*0] 3108*c0909341SAndroid Build Coastguard Worker paddw m19, m3 3109*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2, m10 ; a2 3110*c0909341SAndroid Build Coastguard Worker paddw m20, m15 3111*c0909341SAndroid Build Coastguard Worker pmaddubsw m15, m14, m10 3112*c0909341SAndroid Build Coastguard Worker paddw
m17, m3 3113*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m5, m16 ; 56 3114*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m10 ; b2 3115*c0909341SAndroid Build Coastguard Worker paddw m18, m15 3116*c0909341SAndroid Build Coastguard Worker punpckhbw m15, m5, m16 3117*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m15, m10 3118*c0909341SAndroid Build Coastguard Worker paddw m19, m4 3119*c0909341SAndroid Build Coastguard Worker paddw m20, m5 3120*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m7}, m17, m18, m19, m20 3121*c0909341SAndroid Build Coastguard Worker mova [r7+wq*0+ 0], m17 3122*c0909341SAndroid Build Coastguard Worker mova [r7+wq*0+64], m18 3123*c0909341SAndroid Build Coastguard Worker mova [r7+wq*1+ 0], m19 3124*c0909341SAndroid Build Coastguard Worker mova [r7+wq*1+64], m20 3125*c0909341SAndroid Build Coastguard Worker lea r7, [r7+wq*2] 3126*c0909341SAndroid Build Coastguard Worker sub hd, 2 3127*c0909341SAndroid Build Coastguard Worker jg .v_loop 3128*c0909341SAndroid Build Coastguard Worker add srcq, 64 3129*c0909341SAndroid Build Coastguard Worker add tmpq, 128 3130*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 3131*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 3132*c0909341SAndroid Build Coastguard Worker jg .v_loop0 3133*c0909341SAndroid Build Coastguard Worker vzeroupper 3134*c0909341SAndroid Build Coastguard Worker RET 3135*c0909341SAndroid Build Coastguard Worker.h: 3136*c0909341SAndroid Build Coastguard Worker test myd, 0xf00 3137*c0909341SAndroid Build Coastguard Worker jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 3138*c0909341SAndroid Build Coastguard Worker.hv: 3139*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_2] 3140*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_32] 3141*c0909341SAndroid Build Coastguard Worker cmp wd, 4 3142*c0909341SAndroid Build Coastguard Worker jg .hv_w8 3143*c0909341SAndroid Build Coastguard Worker movzx mxd, mxb
3144*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+subpel_filters+mxq*8+2] 3145*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3146*c0909341SAndroid Build Coastguard Worker shr myd, 16 3147*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3148*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3149*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [base+subpel_filters+1+myq*8] 3150*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [subpel_h_shufA] 3151*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*2+1] 3152*c0909341SAndroid Build Coastguard Worker mov r3d, 0x30 3153*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3154*c0909341SAndroid Build Coastguard Worker kmovb k1, r3d 3155*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym2, [srcq+ssq*0] 3156*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 3157*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [srcq+ssq*1] 3158*c0909341SAndroid Build Coastguard Worker kaddb k2, k1, k1 3159*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2{k1}, [srcq+ssq*2] 3160*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 3161*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3 3162*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m3 3163*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4 3164*c0909341SAndroid Build Coastguard Worker psraw m3, 8 ; sign-extend 3165*c0909341SAndroid Build Coastguard Worker mova m6, [spel_hv_perm4a] 3166*c0909341SAndroid Build Coastguard Worker kshiftrb k1, k1, 2 3167*c0909341SAndroid Build Coastguard Worker movu m7, [spel_hv_perm4b] 3168*c0909341SAndroid Build Coastguard Worker pshufb m1, m10 3169*c0909341SAndroid Build Coastguard Worker mova m0, m8 3170*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m1, m11 3171*c0909341SAndroid Build Coastguard Worker pshufb m2, m10 3172*c0909341SAndroid Build Coastguard Worker mova m1, m8 
3173*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m2, m11 3174*c0909341SAndroid Build Coastguard Worker pshufd m12, m3, q0000 3175*c0909341SAndroid Build Coastguard Worker pshufd m13, m3, q1111 3176*c0909341SAndroid Build Coastguard Worker pshufd m14, m3, q2222 3177*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 ; _ _ _ 0 1 2 3 4 3178*c0909341SAndroid Build Coastguard Worker psraw m0, 2 3179*c0909341SAndroid Build Coastguard Worker vpermb m1, m7, m0 ; 01 12 23 34 3180*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: 3181*c0909341SAndroid Build Coastguard Worker movq xm3, [srcq+ssq*2] 3182*c0909341SAndroid Build Coastguard Worker movq xm4, [srcq+ss3q ] 3183*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3184*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7 3185*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8 3186*c0909341SAndroid Build Coastguard Worker pshufb ym3, ym10 3187*c0909341SAndroid Build Coastguard Worker mova ym2, ym8 3188*c0909341SAndroid Build Coastguard Worker vpdpbusd ym2, ym3, ym11 3189*c0909341SAndroid Build Coastguard Worker pshufb ym4, ym10 3190*c0909341SAndroid Build Coastguard Worker mova ym3, ym8 3191*c0909341SAndroid Build Coastguard Worker vpdpbusd ym3, ym4, ym11 3192*c0909341SAndroid Build Coastguard Worker mova m4, m9 3193*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m1, m12 ; a0 b0 c0 d0 3194*c0909341SAndroid Build Coastguard Worker packssdw ym2, ym3 ; 5 6 7 8 3195*c0909341SAndroid Build Coastguard Worker psraw ym2, 2 3196*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8 3197*c0909341SAndroid Build Coastguard Worker vpermb m2, m6, m0 ; 23 34 45 56 3198*c0909341SAndroid Build Coastguard Worker vpermb m1, m7, m0 ; 45 56 67 78 3199*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m2, m13 ; a1 b1 c1 d1 3200*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m1, m14 ; a2 b2 c2 d2 
3201*c0909341SAndroid Build Coastguard Worker psrad m4, 6 3202*c0909341SAndroid Build Coastguard Worker vpmovdw [tmpq], m4 3203*c0909341SAndroid Build Coastguard Worker add tmpq, 32 3204*c0909341SAndroid Build Coastguard Worker sub hd, 4 3205*c0909341SAndroid Build Coastguard Worker jg .hv_w4_loop 3206*c0909341SAndroid Build Coastguard Worker RET 3207*c0909341SAndroid Build Coastguard Worker.hv_w8: 3208*c0909341SAndroid Build Coastguard Worker shr mxd, 16 3209*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+subpel_filters+mxq*8+0] 3210*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+subpel_filters+mxq*8+4] 3211*c0909341SAndroid Build Coastguard Worker movzx mxd, myb 3212*c0909341SAndroid Build Coastguard Worker shr myd, 16 3213*c0909341SAndroid Build Coastguard Worker cmp hd, 4 3214*c0909341SAndroid Build Coastguard Worker cmove myd, mxd 3215*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [base+subpel_filters+1+myq*8] 3216*c0909341SAndroid Build Coastguard Worker lea r6, [ssq*2+3] 3217*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m0 3218*c0909341SAndroid Build Coastguard Worker sub srcq, r6 3219*c0909341SAndroid Build Coastguard Worker psraw m0, 8 ; sign-extend 3220*c0909341SAndroid Build Coastguard Worker lea ss3q, [ssq*3] 3221*c0909341SAndroid Build Coastguard Worker pshufd m12, m0, q0000 3222*c0909341SAndroid Build Coastguard Worker pshufd m13, m0, q1111 3223*c0909341SAndroid Build Coastguard Worker pshufd m14, m0, q2222 3224*c0909341SAndroid Build Coastguard Worker cmp wd, 8 3225*c0909341SAndroid Build Coastguard Worker jg .hv_w16 3226*c0909341SAndroid Build Coastguard Worker movu xm16, [srcq+ssq*0] 3227*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m19, [subpel_h_shufA] 3228*c0909341SAndroid Build Coastguard Worker vinserti128 ym16, [srcq+ssq*1], 1 3229*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m21, [subpel_h_shufC] 3230*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, 
[srcq+ssq*2], 2 3231*c0909341SAndroid Build Coastguard Worker add srcq, ss3q 3232*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, [srcq+ssq*0], 3 3233*c0909341SAndroid Build Coastguard Worker movu xm17, [srcq+ssq*1] 3234*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [subpel_h_shufB] 3235*c0909341SAndroid Build Coastguard Worker pshufb m3, m16, m19 ; 0 1 2 3 0123 3236*c0909341SAndroid Build Coastguard Worker mova m2, m8 3237*c0909341SAndroid Build Coastguard Worker pshufb m0, m16, m21 ; 0 1 2 3 89ab 3238*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m3, m10 3239*c0909341SAndroid Build Coastguard Worker mova m3, m8 3240*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm17, xm19 ; 4 0123 3241*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m0, m11 3242*c0909341SAndroid Build Coastguard Worker mova xm0, xm8 3243*c0909341SAndroid Build Coastguard Worker pshufb xm18, xm17, xm21 ; 4 89ab 3244*c0909341SAndroid Build Coastguard Worker vpdpbusd xm0, xm1, xm10 3245*c0909341SAndroid Build Coastguard Worker mova xm1, xm8 3246*c0909341SAndroid Build Coastguard Worker pshufb m16, m20 ; 0 1 2 3 4567 3247*c0909341SAndroid Build Coastguard Worker vpdpbusd xm1, xm18, xm11 3248*c0909341SAndroid Build Coastguard Worker pshufb xm17, xm20 ; 4 4567 3249*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m16, m11 3250*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m16, m10 3251*c0909341SAndroid Build Coastguard Worker vpdpbusd xm0, xm17, xm11 3252*c0909341SAndroid Build Coastguard Worker vpdpbusd xm1, xm17, xm10 3253*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 3254*c0909341SAndroid Build Coastguard Worker packssdw xm0, xm1 3255*c0909341SAndroid Build Coastguard Worker psraw m2, 2 ; 0 1 2 3 3256*c0909341SAndroid Build Coastguard Worker psraw xm0, 2 ; 4 3257*c0909341SAndroid Build Coastguard Worker valignq m0, m2, 2 ; 1 2 3 4 3258*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m0 ; 01 12 23 34
3259*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0 3260*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 3261*c0909341SAndroid Build Coastguard Worker movu xm16, [srcq+ssq*2] 3262*c0909341SAndroid Build Coastguard Worker vinserti128 ym16, [srcq+ss3q ], 1 3263*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+ssq*4] 3264*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, [srcq+ssq*0], 2 3265*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, [srcq+ssq*1], 3 3266*c0909341SAndroid Build Coastguard Worker pshufb m6, m16, m19 ; 5 6 7 8 0123 3267*c0909341SAndroid Build Coastguard Worker mova m5, m8 3268*c0909341SAndroid Build Coastguard Worker pshufb m3, m16, m21 ; 5 6 7 8 89ab 3269*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m6, m10 3270*c0909341SAndroid Build Coastguard Worker mova m6, m8 3271*c0909341SAndroid Build Coastguard Worker pshufb m16, m20 ; 5 6 7 8 4567 3272*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m3, m11 3273*c0909341SAndroid Build Coastguard Worker mova m3, m9 3274*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m1, m12 ; a0 b0 c0 d0 3275*c0909341SAndroid Build Coastguard Worker mova m4, m9 3276*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m2, m12 3277*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m16, m11 3278*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m16, m10 3279*c0909341SAndroid Build Coastguard Worker mova m16, m1 3280*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 3281*c0909341SAndroid Build Coastguard Worker mova m6, m2 3282*c0909341SAndroid Build Coastguard Worker psraw m5, 2 ; 5 6 7 8 3283*c0909341SAndroid Build Coastguard Worker valignq m2, m5, m0, 6 ; 4 5 6 7 3284*c0909341SAndroid Build Coastguard Worker mova m0, m5 3285*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m5 ; 45 56 67 78 3286*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m5 3287*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m1, m14 ; a2 b2 c2 d2 
3288*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m2, m14 3289*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m1, q1032 ; 23 34 45 56 3290*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m2, q1032 3291*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m16, m13 ; a1 b1 c1 d1 3292*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m6, m13 3293*c0909341SAndroid Build Coastguard Worker psrad m3, 6 3294*c0909341SAndroid Build Coastguard Worker psrad m4, 6 3295*c0909341SAndroid Build Coastguard Worker packssdw m3, m4 3296*c0909341SAndroid Build Coastguard Worker mova [tmpq], m3 3297*c0909341SAndroid Build Coastguard Worker add tmpq, 64 3298*c0909341SAndroid Build Coastguard Worker sub hd, 4 3299*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 3300*c0909341SAndroid Build Coastguard Worker vzeroupper 3301*c0909341SAndroid Build Coastguard Worker RET 3302*c0909341SAndroid Build Coastguard Worker.hv_w16: 3303*c0909341SAndroid Build Coastguard Worker mova m16, [spel_h_perm16] 3304*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [pb_4] 3305*c0909341SAndroid Build Coastguard Worker add wd, wd 3306*c0909341SAndroid Build Coastguard Worker paddb m17, m18, m16 3307*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+wq*8-256] 3308*c0909341SAndroid Build Coastguard Worker paddb m18, m17 3309*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 3310*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+ssq*0] 3311*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+ssq*1], 1 3312*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+ssq*2] 3313*c0909341SAndroid Build Coastguard Worker movu ym20, [r5 +ssq*0] 3314*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, [r5 +ssq*1], 1 3315*c0909341SAndroid Build Coastguard Worker lea r5, [r5 +ssq*2] 3316*c0909341SAndroid Build Coastguard Worker movu ym21, [r5 +ssq*0] 3317*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 3318*c0909341SAndroid Build 
Coastguard Worker vpermb m3, m16, m19 ; 0 1 0123 89ab 3319*c0909341SAndroid Build Coastguard Worker mova m2, m8 3320*c0909341SAndroid Build Coastguard Worker vpermb m4, m18, m19 ; 0 1 89ab ghij 3321*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m3, m10 3322*c0909341SAndroid Build Coastguard Worker mova m3, m8 3323*c0909341SAndroid Build Coastguard Worker vpermb m5, m16, m20 ; 2 3 0123 89ab 3324*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m11 3325*c0909341SAndroid Build Coastguard Worker mova m4, m8 3326*c0909341SAndroid Build Coastguard Worker vpermb m0, m18, m20 ; 2 3 89ab ghij 3327*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m5, m10 3328*c0909341SAndroid Build Coastguard Worker mova m5, m8 3329*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym16, ym21 ; 4 0123 89ab 3330*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m0, m11 3331*c0909341SAndroid Build Coastguard Worker mova ym0, ym8 3332*c0909341SAndroid Build Coastguard Worker vpermb ym6, ym18, ym21 ; 4 89ab ghij 3333*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym1, ym10 3334*c0909341SAndroid Build Coastguard Worker mova ym1, ym8 3335*c0909341SAndroid Build Coastguard Worker vpermb m19, m17, m19 ; 0 1 4567 cdef 3336*c0909341SAndroid Build Coastguard Worker vpdpbusd ym1, ym6, ym11 3337*c0909341SAndroid Build Coastguard Worker vpermb m20, m17, m20 ; 2 3 4567 cdef 3338*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m19, m11 3339*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m19, m10 3340*c0909341SAndroid Build Coastguard Worker vpermb ym21, ym17, ym21 ; 4 4567 cdef 3341*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m20, m11 3342*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m20, m10 3343*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym21, ym11 3344*c0909341SAndroid Build Coastguard Worker vpdpbusd ym1, ym21, ym10 3345*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 ; 0 1 3346*c0909341SAndroid Build Coastguard 
Worker packssdw m4, m5 ; 2 3 3347*c0909341SAndroid Build Coastguard Worker packssdw ym0, ym1 ; 4 3348*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m2, m4, ym0 3349*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m2, m4, q1032 ; 1 2 3350*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m4, m0, q1032 ; 3 4 3351*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 ; 01 12 3352*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 3353*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m0 ; 23 34 3354*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 3355*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 3356*c0909341SAndroid Build Coastguard Worker movu ym19, [r5+ssq*1] 3357*c0909341SAndroid Build Coastguard Worker lea r5, [r5+ssq*2] 3358*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [r5+ssq*0], 1 3359*c0909341SAndroid Build Coastguard Worker vpermb m6, m16, m19 ; 5 6 0123 89ab 3360*c0909341SAndroid Build Coastguard Worker mova m5, m8 3361*c0909341SAndroid Build Coastguard Worker vpermb m20, m18, m19 ; 5 6 89ab ghij 3362*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m6, m10 3363*c0909341SAndroid Build Coastguard Worker mova m6, m8 3364*c0909341SAndroid Build Coastguard Worker vpermb m19, m17, m19 ; 5 6 4567 cdef 3365*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m20, m11 3366*c0909341SAndroid Build Coastguard Worker mova m20, m9 3367*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m1, m12 ; a0 b0 3368*c0909341SAndroid Build Coastguard Worker mova m21, m9 3369*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m2, m12 3370*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m19, m11 3371*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m19, m10 3372*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m3, m13 ; a1 b1 3373*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m4, m13 3374*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 
; ---------------------------------------------------------------------------
; Tail of a routine whose beginning lies before this excerpt (the labels
; .hv_w16_loop and .hv_w16_loop0 are defined earlier). The instructions are
; reproduced unchanged.
; ---------------------------------------------------------------------------
    mova            m1, m3
    psraw           m5, 2 ; 5 6
    mova            m2, m4
    vshufi32x4      m4, m0, m5, q1032 ; 4 5
    mova            m0, m5
    punpcklwd       m3, m4, m0 ; 45 56
    punpckhwd       m4, m0
    vpdpwssd        m20, m3, m14 ; a2 b2
    vpdpwssd        m21, m4, m14
    psrad           m20, 6
    psrad           m21, 6
    packssdw        m20, m21
    mova            [r7+wq*0], ym20             ; low 256 bits stored at r7
    vextracti32x8   [r7+wq*1], m20, 1           ; high 256 bits stored at r7+wq
    lea             r7, [r7+wq*2]
    sub             hd, 2
    jg .hv_w16_loop
    add             srcq, 16
    add             tmpq, 32
    movzx           hd, r6b                     ; reload hd from the low byte of r6
    sub             r6d, 1<<8                   ; one outer iteration == 1<<8 in r6d
    jg .hv_w16_loop0
    vzeroupper
    RET

; ---------------------------------------------------------------------------
; PREP_8TAP_H
; Horizontal filter step shared by the .h_w16, .h_w32 and .h_w64/.h_w128
; loops of prep_8tap_8bpc.
;   in:   m0, m1   source bytes
;         m5-m7    vpermb index vectors selecting three byte windows (A, B, C)
;         m4       value every dword accumulator starts from (.h2 loads pd_2)
;         m8, m9   dword coefficient vectors for vpdpbusd (.h2 broadcasts filter
;                  bytes 0-3 into m8 and bytes 4-7 into m9)
;         tmpq     destination pointer (not advanced here)
;   out:  [tmpq+64*0], [tmpq+64*1]   packed words
;   clobbers m0-m3, m10-m15
; For each source register two dword sums are formed, m4 + A*m8 + B*m9 and
; m4 + B*m8 + C*m9; they are packed to words with signed saturation and then
; shifted right arithmetically by 2.
; ---------------------------------------------------------------------------
%macro PREP_8TAP_H 0
    vpermb          m10, m5, m0                 ; window A of m0
    vpermb          m11, m5, m1                 ; window A of m1
    vpermb          m12, m6, m0                 ; window B of m0
    vpermb          m13, m6, m1                 ; window B of m1
    vpermb          m14, m7, m0                 ; window C of m0
    vpermb          m15, m7, m1                 ; window C of m1
    mova            m0, m4
    vpdpbusd        m0, m10, m8
    mova            m2, m4
    vpdpbusd        m2, m12, m8
    mova            m1, m4
    vpdpbusd        m1, m11, m8
    mova            m3, m4
    vpdpbusd        m3, m13, m8
    vpdpbusd        m0, m12, m9
    vpdpbusd        m2, m14, m9
    vpdpbusd        m1, m13, m9
    vpdpbusd        m3, m15, m9
    packssdw        m0, m2
    packssdw        m1, m3
    psraw           m0, 2
    psraw           m1, 2
    mova            [tmpq+64*0], m0
    mova            [tmpq+64*1], m1
%endmacro

; Entry points generated by PREP_8TAP_FN (macro defined outside this excerpt).
PREP_8TAP_FN smooth_sharp,  SMOOTH,  SHARP, prep_8tap_8bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp,         SHARP,   SHARP

; ---------------------------------------------------------------------------
; prep_8tap_8bpc
; Declared through x86inc's cglobal with the named arguments
; tmp, src, stride, w, h, mx, my and the extra register name stride3.
; t0d and t1d are read below but not written in this excerpt - presumably the
; PREP_8TAP_FN entry points set them before falling through; TODO confirm.
; Dispatch:
;   bits 8-11 of mxd non-zero          -> .h  (which itself goes to .hv when
;                                              bits 8-11 of myd are non-zero)
;   else bits 8-11 of myd zero         -> .prep of prep_6tap_8bpc_avx512icl
;   else                               -> .v  (fall through)
; ---------------------------------------------------------------------------
cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
    imul            mxd, mxm, 0x010101          ; replicate mx into bytes 0, 1 and 2
    add             mxd, t0d ; 8tap_h, mx, 4tap_h
    imul            myd, mym, 0x010101          ; same replication for my
    add             myd, t1d ; 8tap_v, my, 4tap_v
    lea             r7, [prep_avx512icl]        ; base that the jump-table offsets are added to
    movifnidn       hd, hm
    test            mxd, 0xf00
    jnz .h
    test            myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
; ---- vertical-only filtering ----------------------------------------------
; Common setup: pick the filter, broadcast its four byte pairs into m8-m11,
; move srcq up by three rows and jump through the _8tap_v table on log2(w).
.v:
    movzx           mxd, myb ; Select 4-tap/8-tap filter multipliers.
    shr             myd, 16  ; Note that the code is 8-tap only, having
    cmp             hd, 4    ; a separate 4-tap code path for (4|8|16)x4
    cmove           myd, mxd ; had a negligible effect on performance.
    tzcnt           r5d, wd                     ; table index = number of trailing zero bits in w
    lea             myq, [base+subpel_filters+myq*8] ; 8 bytes per filter entry
    movzx           r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
    vpbroadcastd    m7, [pw_8192]               ; multiplier for the final pmulhrsw
    vpbroadcastw    m8, [myq+0]                 ; filter bytes 0,1
    add             r5, r7
    vpbroadcastw    m9, [myq+2]                 ; filter bytes 2,3
    lea             stride3q, [strideq*3]
    vpbroadcastw    m10, [myq+4]                ; filter bytes 4,5
    sub             srcq, stride3q              ; start three rows above the current row
    vpbroadcastw    m11, [myq+6]                ; filter bytes 6,7
    jmp             r5
; ---- .v, w == 4: 32 bytes written and hd reduced by 4 per iteration ---------
; The trailing comments give the source-row number held by each dword
; (before interleaving) or each interleaved row pair (after).
.v_w4:
    movd            xmm0, [srcq+strideq*0]
    vpbroadcastd    ymm1, [srcq+strideq*2]
    vpbroadcastd    xmm2, [srcq+strideq*1]
    vpbroadcastd    ymm3, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    vpblendd        ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
    vpblendd        ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
    vpbroadcastd    ymm0, [srcq+strideq*0]
    vpbroadcastd    ymm2, [srcq+strideq*1]
    vpblendd        ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
    vpbroadcastd    ymm0, [srcq+strideq*2]
    vbroadcasti128  ymm5, [deint_shuf4]
    vpblendd        ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
    vpblendd        ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
    vpblendd        ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
    punpcklbw       ymm1, ymm2, ymm3 ; 01 12 23 34
    vpblendd        ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
    punpckhbw       ymm2, ymm3 ; 23 34 45 56
.v_w4_loop:
    pinsrd          xmm0, [srcq+stride3q ], 1
    lea             srcq, [srcq+strideq*4]
    vpbroadcastd    ymm3, [srcq+strideq*0]
    vpbroadcastd    ymm4, [srcq+strideq*1]
    vpblendd        ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
    vpblendd        ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
    vpbroadcastd    ymm0, [srcq+strideq*2]
    vpblendd        ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
    pshufb          ymm3, ymm5 ; 67 78 89 9a
    pmaddubsw       ymm4, ymm1, ym8
    vperm2i128      ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
    pmaddubsw       ymm2, ym9
    paddw           ymm4, ymm2
    mova            ymm2, ymm3                  ; carried into the next iteration
    pmaddubsw       ymm3, ym11
    paddw           ymm3, ymm4
    pmaddubsw       ymm4, ymm1, ym10
    paddw           ymm3, ymm4
    pmulhrsw        ymm3, ym7
    mova            [tmpq], ymm3
    add             tmpq, 32
    sub             hd, 4
    jg .v_w4_loop
    vzeroupper
    RET
; ---- .v, w == 8: 64 bytes written and hd reduced by 4 per iteration ---------
.v_w8:
    mova            m6, [spel_v_perm8]
    movq            xm1, [srcq+strideq*0]
    mov             r6d, 0x3e                   ; k1 = qword lanes 1-5
    movq            xm2, [srcq+strideq*1]
    vpbroadcastq    ym3, [srcq+strideq*2]
    kmovb           k1, r6d
    vpbroadcastq    ym4, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    vpunpcklqdq     m1{k1}, m3, [srcq+strideq*0] {1to8}
    vpunpcklqdq     m2{k1}, m4, [srcq+strideq*1] {1to8}
    movq            xm0, [srcq+strideq*2]
    kshiftlb        k2, k1, 2                   ; k2 = qword lanes 3-7
    shufpd          m1, m2, 0x30 ; 0 1 2 3 4 5
    vshufi32x4      m2, m1, m0, q0021 ; 2 3 4 5 6 _
    vpermb          m1, m6, m1 ; 01 12 23 34
    vpermb          m2, m6, m2 ; 23 34 45 56
.v_w8_loop:
    vpbroadcastq    ym3, [srcq+strideq*4]
    vpunpcklqdq     ym0{k1}, ym3, [srcq+stride3q] {1to4}
    lea             srcq, [srcq+strideq*4]
    vpbroadcastq    m3, [srcq+strideq*2]
    vpunpcklqdq     m0{k2}, m3, [srcq+strideq*1] {1to8}
    pmaddubsw       m4, m1, m8 ; a0 b0 c0 d0
    mova            m1, m2
    pmaddubsw       m5, m2, m9 ; a1 b1 c1 d1
    vpermb          m2, m6, m0 ; 67 78 89 9a
    mova            xm0, xm3                    ; low 128 bits of m3 seed m0 for the next iteration
    vshufi32x4      m1, m2, q1032 ; 45 56 67 78
    pmaddubsw       m3, m2, m11 ; a3 b3 c3 d3
    paddw           m4, m5
    pmaddubsw       m5, m1, m10 ; a2 b2 c2 d2
    paddw           m4, m3
    paddw           m4, m5
    pmulhrsw        m4, m7
    mova            [tmpq], m4
    add             tmpq, 64
    sub             hd, 4
    jg .v_w8_loop
    RET
; ---- .v, w == 16: 128 bytes written and hd reduced by 4 per iteration -------
.v_w16:
    mova            m12, [spel_v_perm16b]
    vbroadcasti32x4 m1, [srcq+strideq*0]
    mov             r6d, 0x0f                   ; k1 = qword lanes 0-3 (low 256 bits)
    vbroadcasti32x4 ym4, [srcq+strideq*1]
    vbroadcasti32x4 m2, [srcq+strideq*2]
    kmovb           k1, r6d
    vbroadcasti32x4 ym5, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    vbroadcasti32x4 m3, [srcq+strideq*0]
    vbroadcasti32x4 ym6, [srcq+strideq*1]
    vbroadcasti32x4 m0, [srcq+strideq*2]
    vshufpd         m1{k1}, m4, m2, 0xcc
    vshufpd         m2{k1}, m5, m3, 0xcc
    vshufpd         m3{k1}, m6, m0, 0xcc
    vpermb          m1, m12, m1 ; 01 12
    vpermb          m2, m12, m2 ; 23 34
    vpermb          m3, m12, m3 ; 45 56
.v_w16_loop:
    pmaddubsw       m4, m1, m8 ; a0 b0
    mova            m1, m3
    pmaddubsw       m13, m2, m9 ; a1 b1
    vbroadcasti32x4 ym6, [srcq+stride3q ]
    pmaddubsw       m5, m2, m8 ; c0 d0
    lea             srcq, [srcq+strideq*4]
    pmaddubsw       m14, m3, m9 ; c1 d1
    vbroadcasti32x4 m3, [srcq+strideq*0]
    vshufpd         m0{k1}, m6, m3, 0xcc
    vbroadcasti32x4 ym6, [srcq+strideq*1]
    vpermb          m2, m12, m0 ; 67 78
    vbroadcasti32x4 m0, [srcq+strideq*2]
    vshufpd         m3{k1}, m6, m0, 0xcc
    paddw           m4, m13
    pmaddubsw       m13, m1, m10 ; a2 b2
    vpermb          m3, m12, m3 ; 89 9a
    paddw           m5, m14
    pmaddubsw       m14, m2, m10 ; c2 d2
    pmaddubsw       m15, m2, m11 ; a3 b3
    pmaddubsw       m6, m3, m11 ; c3 d3
    paddw           m4, m13
    paddw           m5, m14
    paddw           m4, m15
    paddw           m5, m6
    pmulhrsw        m4, m7
    pmulhrsw        m5, m7
    mova            [tmpq+ 0], m4
    mova            [tmpq+64], m5
    add             tmpq, 64*2
    sub             hd, 4
    jg .v_w16_loop
    RET
; ---- .v, w == 32: 128 bytes written and hd reduced by 2 per iteration -------
.v_w32:
    movshdup        m21, [bilin_v_perm64]
    movu            ym16, [srcq+strideq*0]
    movu            ym17, [srcq+strideq*1]
    movu            ym18, [srcq+strideq*2]
    add             srcq, stride3q
    movu            ym19, [srcq+strideq*0]
    vpermt2q        m16, m21, m19 ; 0 3
    movu            ym20, [srcq+strideq*1]
    vpermt2q        m17, m21, m20 ; 1 4
    movu            ym20, [srcq+strideq*2]
    add             srcq, stride3q
    vpermt2q        m18, m21, m20 ; 2 5
    movu            ym20, [srcq+strideq*0]
    vpermt2q        m19, m21, m20 ; 3 6
    punpcklbw       m0, m16, m17 ; 01
    punpcklbw       m1, m17, m18 ; 12
    punpcklbw       m2, m18, m19 ; 23
    punpckhbw       m3, m16, m17 ; 34
    punpckhbw       m4, m17, m18 ; 45
    punpckhbw       m5, m18, m19 ; 56
.v_w32_loop:
    movu            ym16, [srcq+strideq*1]
    lea             srcq, [srcq+strideq*2]
    movu            ym17, [srcq+strideq*0]
    pmaddubsw       m14, m0, m8
    mova            m0, m2
    pmaddubsw       m15, m1, m8
    mova            m1, m3
    pmaddubsw       m2, m9
    vpermt2q        m16, m21, m17 ; 7 8
    pmaddubsw       m3, m9
    pmaddubsw       m12, m4, m10
    pmaddubsw       m13, m5, m10
    shufpd          m19, m16, 0x55 ; 6 7
    paddw           m14, m2
    mova            m2, m4
    punpcklbw       m4, m19, m16 ; 67
    paddw           m15, m3
    mova            m3, m5
    punpckhbw       m5, m19, m16 ; 78
    paddw           m14, m12
    paddw           m15, m13
    pmaddubsw       m12, m4, m11
    pmaddubsw       m13, m5, m11
    mova            m19, m16                    ; kept for the next iteration's shufpd
    paddw           m14, m12
    paddw           m15, m13
    pmulhrsw        m14, m7
    pmulhrsw        m15, m7
    mova            [tmpq+ 0], m14
    mova            [tmpq+64], m15
    add             tmpq, 64*2
    sub             hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
; ---- .v, w == 64 / 128 (shared code) ----------------------------------------
; Processes the block in 64-byte-wide source columns: .v_loop walks down one
; column two rows at a time, .v_loop0 then advances srcq by 64 and tmpq by 128.
; wd is doubled first and wq is then used as the byte step between the two
; rows stored per iteration. r6d = h + 2*w is the outer counter (decremented
; by 1<<8 per column); hd is reloaded from its low byte.
.v_w64:
.v_w128:
    WIN64_SPILL_XMM 24                          ; x86inc macro; registers up to m23 are used below
    mova            m23, [bilin_v_perm64]
    add             wd, wd
    lea             r6d, [hq+wq]
.v_loop0:
    vpermq          m12, m23, [srcq+strideq*0]
    vpermq          m13, m23, [srcq+strideq*1]
    lea             r5, [srcq+strideq*2]
    vpermq          m14, m23, [r5 +strideq*0]
    vpermq          m15, m23, [r5 +strideq*1]
    lea             r5, [r5+strideq*2]
    vpermq          m16, m23, [r5 +strideq*0]
    vpermq          m17, m23, [r5 +strideq*1]
    lea             r5, [r5+strideq*2]
    vpermq          m18, m23, [r5 +strideq*0]
    mov             r7, tmpq                    ; r7 = output cursor for this column
    punpcklbw       m0, m12, m13 ; 01
    punpckhbw       m12, m13
    punpcklbw       m1, m13, m14 ; 12
    punpckhbw       m13, m14
    punpcklbw       m2, m14, m15 ; 23
    punpckhbw       m14, m15
    punpcklbw       m3, m15, m16 ; 34
    punpckhbw       m15, m16
    punpcklbw       m4, m16, m17 ; 45
    punpckhbw       m16, m17
    punpcklbw       m5, m17, m18 ; 56
    punpckhbw       m17, m18
.v_loop:
    pmaddubsw       m19, m0, m8 ; a0
    vpermq          m6, m23, [r5+strideq*1]
    pmaddubsw       m20, m12, m8
    mova            m0, m2
    pmaddubsw       m2, m9 ; a1
    mova            m12, m14
    pmaddubsw       m14, m9
    lea             r5, [r5+strideq*2]
    pmaddubsw       m21, m1, m8 ; b0
    pmaddubsw       m22, m13, m8
    mova            m1, m3
    pmaddubsw       m3, m9 ; b1
    mova            m13, m15
    pmaddubsw       m15, m9
    paddw           m19, m2
    mova            m2, m4
    pmaddubsw       m4, m10 ; a2
    paddw           m20, m14
    mova            m14, m16
    pmaddubsw       m16, m10
    paddw           m21, m3
    mova            m3, m5
    pmaddubsw       m5, m10 ; b2
    paddw           m22, m15
    mova            m15, m17
    pmaddubsw       m17, m10
    paddw           m19, m4
    punpcklbw       m4, m18, m6 ; 67
    paddw           m20, m16
    punpckhbw       m16, m18, m6
    vpermq          m18, m23, [r5+strideq*0]
    paddw           m21, m5
    pmaddubsw       m5, m4, m11 ; a3
    paddw           m22, m17
    pmaddubsw       m17, m16, m11
    paddw           m19, m5
    punpcklbw       m5, m6, m18 ; 78
    paddw           m20, m17
    punpckhbw       m17, m6, m18
    pmaddubsw       m6, m5, m11 ; b3
    paddw           m21, m6
    pmaddubsw       m6, m17, m11
    paddw           m22, m6
    REPX {pmulhrsw x, m7}, m19, m20, m21, m22
    mova            [r7+wq*0+ 0], m19
    mova            [r7+wq*0+64], m20
    mova            [r7+wq*1+ 0], m21
    mova            [r7+wq*1+64], m22
    lea             r7, [r7+wq*2]
    sub             hd, 2
    jg .v_loop
    add             srcq, 64
    add             tmpq, 128
    movzx           hd, r6b
    sub             r6d, 1<<8
    jg .v_loop0
    RET
; ---- horizontal filtering ---------------------------------------------------
; .h hands over to .hv when bits 8-11 of myd are set. .h2 loads pd_2 into m4,
; handles w == 4 separately and otherwise broadcasts the two coefficient
; dwords into m8/m9, moves srcq back by 3 bytes and jumps through the
; _8tap_h table on log2(w).
.h:
    RESET_STACK_STATE                           ; x86inc macro; NOTE(review): presumably resets the stack bookkeeping changed by WIN64_SPILL_XMM above - confirm
    test            myd, 0xf00
    jnz .hv
.h2:
    vpbroadcastd    m4, [pd_2]
    cmp             wd, 4
    je .h_w4
    tzcnt           wd, wd
    shr             mxd, 16
    sub             srcq, 3
    movzx           wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
    vpbroadcastd    m8, [base+subpel_filters+mxq*8+0]
    vpbroadcastd    m9, [base+subpel_filters+mxq*8+4]
    add             wq, r7
    jmp             wq
; ---- .h, w == 4: uses only filter bytes 2-5; 32 bytes / 4 rows per pass -----
.h_w4:
    movzx           mxd, mxb                    ; low byte of mxd selects the filter here
    vbroadcasti128  ym5, [subpel_h_shufA]
    mov             r3d, 0x4                    ; k1 = qword lane 2
    dec             srcq
    vpbroadcastd    ym6, [base+subpel_filters+mxq*8+2]
    kmovb           k1, r3d
    lea             stride3q, [strideq*3]
.h_w4_loop:
    movq            xm2, [srcq+strideq*0]
    movq            xm3, [srcq+strideq*1]
    vpbroadcastq    ym2{k1}, [srcq+strideq*2]   ; merged into qword lane 2 only
    vpbroadcastq    ym3{k1}, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    pshufb          ym2, ym5
    pshufb          ym3, ym5
    mova            ym0, ym4
    vpdpbusd        ym0, ym2, ym6
    mova            ym1, ym4
    vpdpbusd        ym1, ym3, ym6
    packssdw        ym0, ym1
    psraw           ym0, 2
    mova            [tmpq], ym0
    add             tmpq, 32
    sub             hd, 4
    jg .h_w4_loop
    RET
; ---- .h, w == 8: four 16-byte row loads, 64 bytes written per pass ----------
.h_w8:
    vbroadcasti128  m5, [subpel_h_shufA]
    vbroadcasti128  m6, [subpel_h_shufB]
    vbroadcasti128  m7, [subpel_h_shufC]
    lea             stride3q, [strideq*3]
.h_w8_loop:
    movu            xmm3, [srcq+strideq*0]
    vinserti128     ym3, ymm3, [srcq+strideq*1], 1
    vinserti128     m3, [srcq+strideq*2], 2
    vinserti128     m3, [srcq+stride3q ], 3
    lea             srcq, [srcq+strideq*4]
    pshufb          m1, m3, m5
    pshufb          m2, m3, m6
    mova            m0, m4
    vpdpbusd        m0, m1, m8
    mova            m1, m4
    vpdpbusd        m1, m2, m8
    pshufb          m3, m7
    vpdpbusd        m0, m2, m9
    vpdpbusd        m1, m3, m9
    packssdw        m0, m1
    psraw           m0, 2
    mova            [tmpq], m0
    add             tmpq, 64
    sub             hd, 4
    jg .h_w8_loop
    RET
; ---- .h, w == 16 ------------------------------------------------------------
; Four source rows per pass (two 32-byte loads per zmm); PREP_8TAP_H writes
; 128 bytes at tmpq. Expects m4, m8 and m9 as prepared by .h2.
.h_w16:
    lea             stride3q, [strideq*3]
    vpbroadcastd    m7, [pb_4]
    mova            m5, [spel_h_perm16]         ; vpermb indices, first window
    paddb           m6, m5, m7                  ; second window: m5 + pb_4 (bytewise)
    paddb           m7, m6                      ; third window:  m6 + pb_4 (bytewise)
.h_w16_loop:
    movu            ym0, [srcq+strideq*0]       ; row 0 -> low  half of m0
    vinserti32x8    m0, m0, [srcq+strideq*1], 1 ; row 1 -> high half of m0
    movu            ym1, [srcq+strideq*2]       ; row 2 -> low  half of m1
    vinserti32x8    m1, m1, [srcq+stride3q ], 1 ; row 3 -> high half of m1
    lea             srcq, [srcq+strideq*4]
    PREP_8TAP_H
    add             tmpq, 64*2
    sub             hd, 4
    jg .h_w16_loop
    RET
; ---- .h, w == 32 ------------------------------------------------------------
; Two source rows per pass, one 64-byte load each; PREP_8TAP_H writes
; 128 bytes at tmpq.
.h_w32:
    vpbroadcastd    m7, [pb_4]
    mova            m5, [spel_h_perm32]         ; vpermb indices, first window
    paddb           m6, m5, m7                  ; second window: m5 + pb_4 (bytewise)
    paddb           m7, m6                      ; third window:  m6 + pb_4 (bytewise)
.h_w32_loop:
    movu            m0, [srcq+strideq*0]        ; row 0
    movu            m1, [srcq+strideq*1]        ; row 1
    lea             srcq, [srcq+strideq*2]
    PREP_8TAP_H
    add             tmpq, 64*2
    sub             hd, 2
    jg .h_w32_loop
    RET
; ---- .h, w == 64 / 128 (shared loop) ----------------------------------------
; r6 is 0 for w == 64 and -64 for w == 128. srcq is biased by -r6 so that the
; inner index r5 runs from r6 up to 0 in steps of 64: one pass for w == 64,
; two for w == 128. Each pass feeds two overlapping 64-byte loads (32 bytes
; apart) to PREP_8TAP_H, which writes 128 bytes at tmpq. One source row is
; consumed per .h_loop0 iteration.
.h_w64:
    xor             r6d, r6d
    jmp .h_start
.h_w128:
    mov             r6, -64*1
.h_start:
    mova            m5, [spel_h_perm32]
    vpbroadcastd    m7, [pb_4]
    sub             srcq, r6
    paddb           m6, m7, m5                  ; m6 = m5 + pb_4 (bytewise)
    paddb           m7, m6                      ; m7 = m6 + pb_4 (bytewise)
.h_loop0:
    mov             r5, r6
.h_loop:
    movu            m0, [srcq+r5+32*0]
    movu            m1, [srcq+r5+32*1]
    PREP_8TAP_H
    add             tmpq, 64*2
    add             r5, 64
    jle .h_loop                                 ; continue while r5 <= 0
    add             srcq, strideq
    dec             hd
    jg .h_loop0
    RET
; ---- horizontal + vertical filtering ----------------------------------------
; m8 = pd_2 is the starting value of the horizontal vpdpbusd accumulators
; (followed by psraw 2); m9 = pd_32 is the starting value of the vertical
; vpdpwssd accumulator (followed by psrad 6). Widths above 4 go to .hv_w8;
; the code immediately below handles w <= 4.
.hv:
    RESET_STACK_STATE                           ; x86inc macro; NOTE(review): presumably resets stack bookkeeping left by another path - confirm
    vpbroadcastd    m8, [pd_2]
    vpbroadcastd    m9, [pd_32]
    cmp             wd, 4
    jg .hv_w8
    movzx           mxd, mxb                    ; low byte of mxd selects the horizontal filter
    dec             srcq
    vpbroadcastd    m11, [base+subpel_filters+mxq*8+2] ; filter bytes 2-5
    movzx           mxd, myb                    ; same selection scheme as in .v:
    shr             myd, 16                     ;   the low-byte index is used when h == 4,
    cmp             hd, 4                       ;   the index in bits 16+ otherwise
    cmove           myd, mxd
    vpbroadcastq    m0, [base+subpel_filters+myq*8]
    lea             stride3q, [strideq*3]
    sub             srcq, stride3q              ; start three rows above the current row
    mov             r3d, 0x04
    kmovb           k1, r3d                     ; k1 = qword lane 2
    kshiftlb        k2, k1, 2                   ; k2 = qword lane 4
    kshiftlb        k3, k1, 4                   ; k3 = qword lane 6
    vbroadcasti128  m10, [subpel_h_shufA]
    punpcklbw       m0, m0
    psraw           m0, 8 ; sign-extend
    pshufd          m12, m0, q0000              ; vertical coefficient words 0,1
    pshufd          m13, m0, q1111              ; words 2,3
    pshufd          m14, m0, q2222              ; words 4,5
    pshufd          m15, m0, q3333              ; words 6,7
    movq            xm3, [srcq+strideq*0]
    vpbroadcastq    ym2, [srcq+strideq*1]
    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
    vpbroadcastq    m2{k2}, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    vpbroadcastq    m3{k2}, [srcq+strideq*0]
    vpbroadcastq    m2{k3}, [srcq+strideq*1]
    vpbroadcastq    m3{k3}, [srcq+strideq*2]
    mova            m6, [spel_hv_perm4a]
    movu            m7, [spel_hv_perm4b]
    mova            m0, m8
    mova            m1, m8
    pshufb          m2, m10
    pshufb          m3, m10
    vpdpbusd        m0, m2, m11
    vpdpbusd        m1, m3, m11
    packssdw        m0, m1 ; _ 0 1 2 3 4 5 6
    psraw           m0, 2
    vpermb          m1, m6, m0 ; 01 12 23 34
    vpermb          m2, m7, m0 ; 23 34 45 56
; Each iteration loads four more rows, filters them horizontally, applies the
; four vertical coefficient pairs and stores 32 bytes.
.hv_w4_loop:
    movq            xm3, [srcq+stride3q ]
    lea             srcq, [srcq+strideq*4]
    movq            xm4, [srcq+strideq*0]
    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
    mova            m5, m9
    pshufb          ym3, ym10
    vpdpwssd        m5, m1, m12 ; a0 b0 c0 d0
    mova            ym1, ym8
    pshufb          ym4, ym10
    vpdpbusd        ym1, ym3, ym11
    mova            ym3, ym8
    vpdpbusd        ym3, ym4, ym11
    vpdpwssd        m5, m2, m13 ; a1 b1 c1 d1
    packssdw        ym1, ym3 ; 7 8 9 a
    psraw           ym1, 2
    vshufi32x4      m0, m1, q1032 ; _ 4 5 6 7 8 9 a
    vpermb          m1, m6, m0 ; 45 56 67 78
    vpermb          m2, m7, m0 ; 67 78 89 9a
    vpdpwssd        m5, m1, m14 ; a2 b2 c2 d2
    vpdpwssd        m5, m2, m15 ; a3 b3 c3 d3
    psrad           m5, 6
    vpmovdw         [tmpq], m5                  ; truncate dwords to words, 32-byte store
    add             tmpq, 32
    sub             hd, 4
    jg .hv_w4_loop
    RET
; ---- .hv, w >= 8: common setup ----------------------------------------------
; m10/m11 receive filter bytes 0-3 / 4-7 of the horizontal filter selected by
; bits 16+ of mxd; m12-m15 receive the sign-extended vertical coefficient
; pairs. Widths above 8 branch to .hv_w16; w == 8 falls through to the code
; that follows.
.hv_w8:
    shr             mxd, 16
    sub             srcq, 3
    vpbroadcastd    m10, [base+subpel_filters+mxq*8+0]
    vpbroadcastd    m11, [base+subpel_filters+mxq*8+4]
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd
    vpbroadcastq    m0, [base+subpel_filters+myq*8]
    lea             stride3q, [strideq*3]
    sub             srcq, stride3q
    punpcklbw       m0, m0
    psraw           m0, 8 ; sign-extend
    pshufd          m12, m0, q0000
    pshufd          m13, m0, q1111
    pshufd          m14, m0, q2222
    pshufd          m15, m0, q3333
    cmp             wd, 8
    jg .hv_w16
3960*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m17, [srcq+stride3q ] 3961*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, m17, [srcq+strideq*0], 0 3962*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m19, [subpel_h_shufA] 3963*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, [srcq+strideq*1], 1 3964*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m21, [subpel_h_shufC] 3965*c0909341SAndroid Build Coastguard Worker vinserti32x4 m16, [srcq+strideq*2], 2 3966*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 3967*c0909341SAndroid Build Coastguard Worker vinserti128 ym17, [srcq+strideq*0], 1 3968*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m20, [subpel_h_shufB] 3969*c0909341SAndroid Build Coastguard Worker vinserti32x4 m17, [srcq+strideq*1], 2 3970*c0909341SAndroid Build Coastguard Worker vinserti32x4 m17, [srcq+strideq*2], 3 3971*c0909341SAndroid Build Coastguard Worker pshufb m3, m16, m19 ; 0 1 2 3 0123 3972*c0909341SAndroid Build Coastguard Worker mova m2, m8 3973*c0909341SAndroid Build Coastguard Worker pshufb m0, m16, m21 ; 0 1 2 3 89ab 3974*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m3, m10 3975*c0909341SAndroid Build Coastguard Worker mova m3, m8 3976*c0909341SAndroid Build Coastguard Worker pshufb m1, m17, m19 ; 3 4 5 6 0123 3977*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m0, m11 3978*c0909341SAndroid Build Coastguard Worker mova m0, m8 3979*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m21 ; 3 4 5 6 89ab 3980*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m1, m10 3981*c0909341SAndroid Build Coastguard Worker mova m1, m8 3982*c0909341SAndroid Build Coastguard Worker pshufb m16, m20 ; 0 1 2 3 4567 3983*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m11 3984*c0909341SAndroid Build Coastguard Worker pshufb m17, m20 ; 3 4 5 6 4567 3985*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m16, m11 3986*c0909341SAndroid Build 
Coastguard Worker vpdpbusd m3, m16, m10 3987*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m17, m11 3988*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m17, m10 3989*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 3990*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 3991*c0909341SAndroid Build Coastguard Worker psraw m2, 2 ; 0 1 2 3 3992*c0909341SAndroid Build Coastguard Worker psraw m0, 2 ; 3 4 5 6 3993*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5 3994*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4 3995*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m0 ; 23 34 45 56 3996*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m0 3997*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m5 ; 01 12 23 34 3998*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m5 3999*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: 4000*c0909341SAndroid Build Coastguard Worker movu xm18, [srcq+stride3q ] 4001*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 4002*c0909341SAndroid Build Coastguard Worker vinserti128 ym18, [srcq+strideq*0], 1 4003*c0909341SAndroid Build Coastguard Worker vinserti32x4 m18, [srcq+strideq*1], 2 4004*c0909341SAndroid Build Coastguard Worker vinserti32x4 m18, [srcq+strideq*2], 3 4005*c0909341SAndroid Build Coastguard Worker pshufb m17, m18, m19 ; 7 8 9 a 0123 4006*c0909341SAndroid Build Coastguard Worker mova m16, m8 4007*c0909341SAndroid Build Coastguard Worker pshufb m5, m18, m21 ; 7 8 9 a 89ab 4008*c0909341SAndroid Build Coastguard Worker vpdpbusd m16, m17, m10 4009*c0909341SAndroid Build Coastguard Worker mova m17, m8 4010*c0909341SAndroid Build Coastguard Worker pshufb m18, m20 ; 7 8 9 a 4567 4011*c0909341SAndroid Build Coastguard Worker vpdpbusd m17, m5, m11 4012*c0909341SAndroid Build Coastguard Worker mova m5, m9 4013*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m3, m13 ; a1 b1 c1 d1 
4014*c0909341SAndroid Build Coastguard Worker mova m6, m9 4015*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m4, m13 4016*c0909341SAndroid Build Coastguard Worker vpdpbusd m16, m18, m11 4017*c0909341SAndroid Build Coastguard Worker vpdpbusd m17, m18, m10 4018*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 4019*c0909341SAndroid Build Coastguard Worker mova m1, m3 4020*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m2, m12 4021*c0909341SAndroid Build Coastguard Worker mova m2, m4 4022*c0909341SAndroid Build Coastguard Worker packssdw m16, m17 4023*c0909341SAndroid Build Coastguard Worker psraw m16, 2 ; 7 8 9 a 4024*c0909341SAndroid Build Coastguard Worker valignq m4, m16, m0, 6 ; 6 7 8 9 4025*c0909341SAndroid Build Coastguard Worker mova m0, m16 4026*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m16 ; 67 78 89 9a 4027*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m16 4028*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m3, m15 ; a3 b3 c3 d3 4029*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m4, m15 4030*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m3, q1032 ; 45 56 67 78 4031*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m4, q1032 4032*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 4033*c0909341SAndroid Build Coastguard Worker vpdpwssd m6, m2, m14 4034*c0909341SAndroid Build Coastguard Worker psrad m5, 6 4035*c0909341SAndroid Build Coastguard Worker psrad m6, 6 4036*c0909341SAndroid Build Coastguard Worker packssdw m5, m6 4037*c0909341SAndroid Build Coastguard Worker mova [tmpq], m5 4038*c0909341SAndroid Build Coastguard Worker add tmpq, 64 4039*c0909341SAndroid Build Coastguard Worker sub hd, 4 4040*c0909341SAndroid Build Coastguard Worker jg .hv_w8_loop 4041*c0909341SAndroid Build Coastguard Worker vzeroupper 4042*c0909341SAndroid Build Coastguard Worker RET 4043*c0909341SAndroid Build Coastguard Worker.hv_w16: 4044*c0909341SAndroid 
Build Coastguard Worker WIN64_SPILL_XMM 23 4045*c0909341SAndroid Build Coastguard Worker mova m16, [spel_h_perm16] 4046*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [pb_4] 4047*c0909341SAndroid Build Coastguard Worker add wd, wd 4048*c0909341SAndroid Build Coastguard Worker paddb m17, m18, m16 4049*c0909341SAndroid Build Coastguard Worker lea r6d, [hq+wq*8-256] 4050*c0909341SAndroid Build Coastguard Worker paddb m18, m17 4051*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0: 4052*c0909341SAndroid Build Coastguard Worker movu ym19, [srcq+strideq*0] 4053*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [srcq+strideq*1], 1 4054*c0909341SAndroid Build Coastguard Worker lea r5, [srcq+strideq*2] 4055*c0909341SAndroid Build Coastguard Worker movu ym20, [r5 +strideq*0] 4056*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, [r5 +strideq*1], 1 4057*c0909341SAndroid Build Coastguard Worker lea r5, [r5 +strideq*2] 4058*c0909341SAndroid Build Coastguard Worker movu ym21, [r5 +strideq*0] 4059*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, [r5 +strideq*1], 1 4060*c0909341SAndroid Build Coastguard Worker lea r5, [r5 +strideq*2] 4061*c0909341SAndroid Build Coastguard Worker movu ym22, [r5 +strideq*0] 4062*c0909341SAndroid Build Coastguard Worker mov r7, tmpq 4063*c0909341SAndroid Build Coastguard Worker vpermb m3, m16, m19 ; 0 1 0123 89ab 4064*c0909341SAndroid Build Coastguard Worker mova m2, m8 4065*c0909341SAndroid Build Coastguard Worker vpermb m4, m18, m19 ; 0 1 89ab ghij 4066*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m3, m10 4067*c0909341SAndroid Build Coastguard Worker mova m3, m8 4068*c0909341SAndroid Build Coastguard Worker vpermb m5, m16, m20 ; 2 3 0123 89ab 4069*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m11 4070*c0909341SAndroid Build Coastguard Worker mova m4, m8 4071*c0909341SAndroid Build Coastguard Worker vpermb m6, m18, m20 ; 2 3 89ab ghij 4072*c0909341SAndroid Build Coastguard Worker 
vpdpbusd m4, m5, m10 4073*c0909341SAndroid Build Coastguard Worker mova m5, m8 4074*c0909341SAndroid Build Coastguard Worker vpermb m7, m16, m21 ; 4 5 0123 89ab 4075*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m6, m11 4076*c0909341SAndroid Build Coastguard Worker mova m6, m8 4077*c0909341SAndroid Build Coastguard Worker vpermb m0, m18, m21 ; 4 5 89ab ghij 4078*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m7, m10 4079*c0909341SAndroid Build Coastguard Worker mova m7, m8 4080*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym16, ym22 ; 6 0123 89ab 4081*c0909341SAndroid Build Coastguard Worker vpdpbusd m7, m0, m11 4082*c0909341SAndroid Build Coastguard Worker mova ym0, ym8 4083*c0909341SAndroid Build Coastguard Worker vpermb m19, m17, m19 ; 0 1 4567 cdef 4084*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym1, ym10 4085*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym18, ym22 ; 6 89ab ghij 4086*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m19, m11 4087*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m19, m10 4088*c0909341SAndroid Build Coastguard Worker mova ym19, ym8 4089*c0909341SAndroid Build Coastguard Worker vpermb m20, m17, m20 ; 2 3 4567 cdef 4090*c0909341SAndroid Build Coastguard Worker vpdpbusd ym19, ym1, ym11 4091*c0909341SAndroid Build Coastguard Worker vpermb m21, m17, m21 ; 4 5 4567 cdef 4092*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m20, m11 4093*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m20, m10 4094*c0909341SAndroid Build Coastguard Worker vpermb ym22, ym17, ym22 ; 6 4567 cdef 4095*c0909341SAndroid Build Coastguard Worker vpdpbusd m6, m21, m11 4096*c0909341SAndroid Build Coastguard Worker vpdpbusd m7, m21, m10 4097*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 ; 0 1 4098*c0909341SAndroid Build Coastguard Worker vpdpbusd ym0, ym22, ym11 4099*c0909341SAndroid Build Coastguard Worker packssdw m4, m5 ; 2 3 4100*c0909341SAndroid Build Coastguard Worker vpdpbusd ym19, 
ym22, ym10 4101*c0909341SAndroid Build Coastguard Worker packssdw m6, m7 ; 4 5 4102*c0909341SAndroid Build Coastguard Worker packssdw ym0, ym19 ; 6 4103*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 2}, m2, m4, m6, ym0 4104*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m2, m4, q1032 ; 1 2 4105*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m4, m6, q1032 ; 3 4 4106*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m6, m0, q1032 ; 5 6 4107*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 ; 01 12 4108*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 4109*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 ; 23 34 4110*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 4111*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m0 ; 45 56 4112*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 4113*c0909341SAndroid Build Coastguard Worker.hv_w16_loop: 4114*c0909341SAndroid Build Coastguard Worker movu ym19, [r5+strideq*1] 4115*c0909341SAndroid Build Coastguard Worker lea r5, [r5+strideq*2] 4116*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [r5+strideq*0], 1 4117*c0909341SAndroid Build Coastguard Worker mova m20, m9 4118*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m1, m12 ; a0 4119*c0909341SAndroid Build Coastguard Worker vpermb m1, m16, m19 4120*c0909341SAndroid Build Coastguard Worker mova m21, m9 4121*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m2, m12 ; b0 4122*c0909341SAndroid Build Coastguard Worker vpermb m2, m17, m19 4123*c0909341SAndroid Build Coastguard Worker mova m22, m8 4124*c0909341SAndroid Build Coastguard Worker vpdpbusd m22, m1, m10 4125*c0909341SAndroid Build Coastguard Worker mova m1, m8 4126*c0909341SAndroid Build Coastguard Worker vpermb m19, m18, m19 4127*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m2, m10 4128*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m3, m13 ; a1 4129*c0909341SAndroid Build Coastguard 
Worker vpdpwssd m21, m4, m13 ; b1 4130*c0909341SAndroid Build Coastguard Worker vpdpbusd m22, m2, m11 4131*c0909341SAndroid Build Coastguard Worker mova m2, m4 4132*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m19, m11 4133*c0909341SAndroid Build Coastguard Worker mova m4, m6 4134*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m5, m14 ; a2 4135*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m6, m14 ; b2 4136*c0909341SAndroid Build Coastguard Worker packssdw m22, m1 4137*c0909341SAndroid Build Coastguard Worker mova m1, m3 4138*c0909341SAndroid Build Coastguard Worker psraw m22, 2 ; 7 8 4139*c0909341SAndroid Build Coastguard Worker mova m3, m5 4140*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m0, m22, q1032 ; 6 7 4141*c0909341SAndroid Build Coastguard Worker mova m0, m22 4142*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m0 ; 67 78 4143*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m0 4144*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m5, m15 ; a3 4145*c0909341SAndroid Build Coastguard Worker vpdpwssd m21, m6, m15 ; b3 4146*c0909341SAndroid Build Coastguard Worker psrad m20, 6 4147*c0909341SAndroid Build Coastguard Worker psrad m21, 6 4148*c0909341SAndroid Build Coastguard Worker packssdw m20, m21 4149*c0909341SAndroid Build Coastguard Worker mova [r7+wq*0], ym20 4150*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r7+wq*1], m20, 1 4151*c0909341SAndroid Build Coastguard Worker lea r7, [r7+wq*2] 4152*c0909341SAndroid Build Coastguard Worker sub hd, 2 4153*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop 4154*c0909341SAndroid Build Coastguard Worker add srcq, 16 4155*c0909341SAndroid Build Coastguard Worker add tmpq, 32 4156*c0909341SAndroid Build Coastguard Worker movzx hd, r6b 4157*c0909341SAndroid Build Coastguard Worker sub r6d, 1<<8 4158*c0909341SAndroid Build Coastguard Worker jg .hv_w16_loop0 4159*c0909341SAndroid Build Coastguard Worker RET 4160*c0909341SAndroid Build Coastguard 
;------------------------------------------------------------------------------
; warp_affine_8x8t_8bpc
;
; Front end that reuses the .main/.main2 bodies of warp_affine_8x8_8bpc
; (reached through their mangled names) and writes the result to tmp.
;   m9   - value that .main2 copies into the m16 accumulator before its
;          dot products (pd_16384 for this variant)
;   ym15 - byte permutation (warp_8x8t_end) applied to m16 before storing
; Each pass stores two 16-byte rows, at tmpq+tsq*0 and tmpq+tsq*2, and tmpq
; then advances by tsq*4 (ts presumably counts 16-bit elements - TODO confirm
; against the C prototype).
; r6d is set to 0x5555 inside .main; subtracting 0x1800 per pass lets .start
; run four times (0x5555 -> 0x3d55 -> 0x2555 -> 0x0d55 -> negative).
;------------------------------------------------------------------------------
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
    vpbroadcastd         m9, [pd_16384]
    mova               ym15, [warp_8x8t_end]
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
    jmp .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
    lea                tmpq, [tmpq+tsq*4]
.start:
    paddd               m16, m16              ; double every dword of the result
    vpermb              m16, m15, m16         ; select the bytes that get stored
    mova       [tmpq+tsq*0], xm16
    vextracti128 [tmpq+tsq*2], ym16, 1
    sub                 r6d, 0x1800
    jg .loop
    RET

;------------------------------------------------------------------------------
; warp_affine_8x8_8bpc
;
; Pixel-output variant. Same driver shape as the function above:
;   m9   - accumulator seed for .main2 (pd_262144 here)
;   xm15 - byte permutation (warp_8x8_end) applied after packing
; Each pass shifts m16 right by 19, packs to unsigned bytes and stores two
; 8-byte rows at dstq+dsq*0 and dstq+dsq*1; dstq advances by dsq*2 per pass.
;
; Local routines:
;   .main  - one-time setup plus the first source rows; falls into .main2
;   .main2 - one vertical step; leaves its result in m16
;   .h     - horizontal step over the next two source rows; result in m0
;------------------------------------------------------------------------------
cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
    vpbroadcastd         m9, [pd_262144]
    mova               xm15, [warp_8x8_end]
    call .main
    jmp .start
.loop:
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
    psrad               m16, 19
    packuswb            m16, m16
    vpermb              m16, m15, m16
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    sub                 r6d, 0x1800           ; r6d starts at 0x5555 (see .main)
    jg .loop
    RET
ALIGN function_align
.main:
    vpbroadcastd         m1, [pd_512]
%if WIN64
    mov               abcdq, r5mp
    vpaddd             ym18, ym1, r6m {1to8} ; mx
%else
    ; mx is taken from r5d directly here (presumably a register argument on
    ; this ABI - TODO confirm); r5 is reused as filterq further down.
    add                 r5d, 512
    vpbroadcastd       ym18, r5d
%endif
    vpaddd             ym20, ym1, r7m {1to8} ; my
    mova               ym16, [pd_0to7]
    vpbroadcastd       ym19, [abcdq+4*0]
    vpbroadcastd       ym21, [abcdq+4*1]
    lea                  r4, [ssq*3+3]        ; overwrites abcdq; both abcd loads are done
    mova                m10, [warp_8x8_permA]
    mov                 r6d, 0x5555           ; k1 source and the drivers' pass counter
    mova                m11, [warp_8x8_permB]
    lea             filterq, [mc_warp_filter+64*8]
    vpbroadcastq        m12, [warp_8x8_hpack]
    sub                srcq, r4               ; src -= src_stride*3 + 3
    vbroadcasti32x4     m13, [warp_8x8_permC]
    kxnorb               k2, k2, k2           ; all-ones 8-bit gather mask
    vbroadcasti32x4     m14, [warp_8x8_permD]
    vpdpwssd           ym18, ym19, ym16       ; alpha
    vpdpwssd           ym20, ym21, ym16       ; gamma
    vbroadcasti32x4      m0, [srcq]
    psrad              ym19, 16               ; beta
    psrad              ym21, 16               ; delta
    kmovw                k1, r6d
    psrad              ym16, ym18, 10
    ; vpgatherdq clears its write-mask as elements complete, so a fresh
    ; all-ones copy is passed back and forth between k2 and k3 around
    ; every gather in .main, .main2 and .h.
    kmovb                k3, k2
    paddd              ym18, ym19
    vpgatherdq       m2{k2}, [filterq+ym16*8] ; filter_x0
    psrld                m1, 8                ; pd_2
    pshufb               m0, m11
    paddd                m8, m1, m1           ; pd_4
    vpdpbusd             m1, m0, m2
    call .h
    psllq                m2, m1, 45
    pslld                m1, 13
    paddd                m1, m2
    vpshrdq              m1, m0, 48           ; 01 12
    call .h
    vpshrdq              m2, m1, m0, 48       ; 23 34
    call .h
    vpshrdq              m3, m2, m0, 48       ; 45 56
.main2:
    call .h
    psrad              ym17, ym20, 10
    kmovb                k2, k3
    paddd              ym20, ym21
    vpgatherdq       m7{k3}, [filterq+ym17*8] ; filter_y0
    psrad              ym16, ym20, 10
    kmovb                k3, k2
    paddd              ym20, ym21
    vpgatherdq      m17{k2}, [filterq+ym16*8] ; filter_y1
    shufps               m5, m7, m17, q2020   ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
    mova                m16, m9               ; accumulator starts from the caller's m9
    pshufb               m4, m5, m13          ; a0 a1 A0 A1 b0 b1 B0 B1
    vpdpwssd            m16, m1, m4
    pshufb               m5, m14              ; a2 a3 A2 A3 b2 b3 B2 B3
    mova                 m1, m2
    vpdpwssd            m16, m2, m5
    shufps               m5, m7, m17, q3131   ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
    mova                 m2, m3
    pshufb               m4, m5, m13          ; a4 a5 A4 A5 b4 b5 B4 B5
    vpdpwssd            m16, m3, m4
    vpshrdq              m3, m0, 48           ; 67 78
    pshufb               m5, m14              ; a6 a7 A6 A7 b6 b7 B6 B7
    vpdpwssd            m16, m3, m5
    ret
ALIGN function_align
.h:
    movu                xm5, [srcq+ssq*1]
    psrad              ym16, ym18, 10
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    kmovb                k2, k3
    paddd              ym18, ym19
    vpgatherdq       m6{k3}, [filterq+ym16*8] ; filter_x1
    psrad              ym17, ym18, 10
    kmovb                k3, k2
    paddd              ym18, ym19
    vpgatherdq      m16{k2}, [filterq+ym17*8] ; filter_x2
    mova                 m0, m8               ; accumulator starts from pd_4
    vpermb               m4, m10, m5          ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
    vpshldq             m17, m16, m6, 32      ; a4 a5 a6 a7 b0 b1 b2 b3
    vpdpbusd             m0, m4, m17
    vpermb               m5, m11, m5          ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
    vmovdqa32       m16{k1}, m6               ; a0 a1 a2 a3 b4 b5 b6 b7
    vpdpbusd             m0, m5, m16
    vpmultishiftqb       m0, m12, m0          ; 1 1 2 2 (>> 3)
    ret

;------------------------------------------------------------------------------
; BIDIR_FN op
;
; Expands to the width-dispatched store code used by the functions that
; invoke it. %1 names a pair of macros:
;   %1 n          - leaves one register of packed output bytes in m0, reading
;                   its inputs at source offset n
;   %1_INC_PTR n  - advances the source pointers by n registers
; On entry wq must hold the address of the .wN label to run (the macro jumps
; to it), hd the number of rows, and dstq/strideq the destination.
;------------------------------------------------------------------------------
%macro BIDIR_FN 1 ; op
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    cmp                  hd, 8
    jg .w4_h16
    WRAP_YMM %1           0
    vextracti32x4       xm1, ym0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    ; Still testing the flags of "cmp hd, 8": nothing since then (including
    ; the expansion of %1) may modify EFLAGS. Less than 8 rows -> done.
    jl .w4_ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
    RET
.w4_h16:
    ; Per-row byte offsets = stride * bidir_sctr_w4[i]; all 16 dwords of m0
    ; are then written with a single scatter (k1 = all ones).
    vpbroadcastd         m7, strided
    pmulld               m7, [bidir_sctr_w4]
    %1                    0
    kxnorw               k1, k1, k1
    vpscatterdd [dstq+m7]{k1}, m0
    RET
.w8:
    cmp                  hd, 4
    jne .w8_h8
    WRAP_YMM %1           0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    %1_INC_PTR            2
    lea                dstq, [dstq+strideq*4]
.w8_h8:
    ; eight rows per iteration: low halves of the four 128-bit lanes first,
    ; then their high halves
    %1                    0
    vextracti32x4       xm1, ym0, 1
    vextracti32x4       xm2, m0, 2
    vextracti32x4       xm3, m0, 3
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    movhps [dstq          ], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub                  hd, 8
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            2
    lea                dstq, [dstq+strideq*4]
.w16:
    ; four rows per iteration; after the qword shuffle the rows sit in
    ; 128-bit lanes 0, 2, 1, 3
    %1                    0
    vpermq               m0, m0, q3120
    mova          [dstq          ], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq             m7, [pb_02461357]    ; qword indices for vpermq
.w32_loop:
    ; two rows per iteration
    %1                    0
    %1_INC_PTR            2
    vpermq               m0, m7, m0
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq             m7, [pb_02461357]    ; qword indices for vpermq
.w64_loop:
    ; one row per iteration
    %1                    0
    %1_INC_PTR            2
    vpermq               m0, m7, m0
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
.w128:
    pmovzxbq             m7, [pb_02461357]    ; qword indices for vpermq
.w128_loop:
    ; one row per iteration, produced as two registers (offsets 0 and 2)
    %1                    0
    vpermq               m6, m7, m0
    %1                    2
    mova        [dstq+64*0], m6
    %1_INC_PTR            4
    vpermq               m6, m7, m0
    mova        [dstq+64*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w128_loop
    RET
%endmacro

; AVG n
; m0 = packed unsigned bytes of the rounded sum of two registers' worth of
; 16-bit values from tmp1 and tmp2, starting n registers in.
; m4 must hold the pmulhrsw multiplier; avg_8bpc loads pw_1024, which turns
; pmulhrsw into a rounded shift right by 5: (x + 16) >> 5.
%macro AVG 1 ; src_offset
    mova                 m0, [tmp1q+(%1+0)*mmsize]
    paddw                m0, [tmp2q+(%1+0)*mmsize]
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    packuswb             m0, m1
%endmacro

; AVG_INC_PTR n
; Advances both intermediate pointers by n registers (n*mmsize bytes).
%macro AVG_INC_PTR 1
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro

; avg_8bpc
; Looks up the store routine for this width in avg_avx512icl_table, indexed
; by the trailing-zero count of w (entries are 32-bit offsets relative to the
; table), leaves its address in wq and hands over to BIDIR_FN.
cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                  r6, [avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m4, [base+pw_1024]
    add                  wq, r6               ; table-relative offset -> address
    BIDIR_FN            AVG

; W_AVG n
; Weighted counterpart of AVG. Expects m4 = weight multiplier and
; m5 = pw_2048 (pmulhrsw by 2048 is a rounded shift right by 4), both set up
; by w_avg_8bpc.
%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
    mova                 m0, [tmp1q+(%1+0)*mmsize]
    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

; w_avg_8bpc
; Same dispatch as avg_8bpc, using w_avg_avx512icl_table. The weight is read
; from r6m. When it is 7 or less, tmp1/tmp2 are exchanged and the multiplier
; is negated, which selects the last form of the identity listed in W_AVG.
; Note that r6 serves as scratch for that exchange, so `base` must not be
; used after "add wq, r6".
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                  r6, [w_avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN          W_AVG

%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
%if mmsize == 64
    vpermq               m3, m8, [maskq+%1*32]
%else
    vpermq               m3, [maskq+%1*16], q3120
%endif
    mova                 m0, [tmp2q+(%1+0)*mmsize]
    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
    psubb                m3, m4, m3
    paddw                m1, m1     ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    mova                 m1, [tmp2q+(%1+1)*mmsize]
    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
    paddw                m2, m2
    punpckhbw            m3, m4, m3
Coastguard Worker pmulhw m2, m3 4486*c0909341SAndroid Build Coastguard Worker paddw m1, m2 4487*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 4488*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 4489*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 4490*c0909341SAndroid Build Coastguard Worker%endmacro 4491*c0909341SAndroid Build Coastguard Worker 4492*c0909341SAndroid Build Coastguard Worker%macro MASK_INC_PTR 1 4493*c0909341SAndroid Build Coastguard Worker add maskq, %1*32 4494*c0909341SAndroid Build Coastguard Worker add tmp2q, %1*64 4495*c0909341SAndroid Build Coastguard Worker add tmp1q, %1*64 4496*c0909341SAndroid Build Coastguard Worker%endmacro 4497*c0909341SAndroid Build Coastguard Worker 4498*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 4499*c0909341SAndroid Build Coastguard Worker%define base r7-mask_avx512icl_table 4500*c0909341SAndroid Build Coastguard Worker lea r7, [mask_avx512icl_table] 4501*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 4502*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 4503*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 4504*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4] 4505*c0909341SAndroid Build Coastguard Worker pxor m4, m4 4506*c0909341SAndroid Build Coastguard Worker mova m8, [base+bilin_v_perm64] 4507*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_2048] 4508*c0909341SAndroid Build Coastguard Worker add wq, r7 4509*c0909341SAndroid Build Coastguard Worker BIDIR_FN MASK 4510*c0909341SAndroid Build Coastguard Worker 4511*c0909341SAndroid Build Coastguard Worker%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 4512*c0909341SAndroid Build Coastguard Worker mova m%1, [tmp1q+mmsize*%3] 4513*c0909341SAndroid Build Coastguard Worker mova m1, [tmp2q+mmsize*%3] 4514*c0909341SAndroid Build Coastguard Worker psubw m1, m%1 4515*c0909341SAndroid Build Coastguard 
Worker pabsw m%2, m1 4516*c0909341SAndroid Build Coastguard Worker psubusw m%2, m6, m%2 4517*c0909341SAndroid Build Coastguard Worker psrlw m%2, 8 ; 64 - m 4518*c0909341SAndroid Build Coastguard Worker psllw m2, m%2, 10 4519*c0909341SAndroid Build Coastguard Worker pmulhw m1, m2 4520*c0909341SAndroid Build Coastguard Worker paddw m%1, m1 4521*c0909341SAndroid Build Coastguard Worker mova m1, [tmp1q+mmsize*%4] 4522*c0909341SAndroid Build Coastguard Worker mova m2, [tmp2q+mmsize*%4] 4523*c0909341SAndroid Build Coastguard Worker psubw m2, m1 4524*c0909341SAndroid Build Coastguard Worker pabsw m3, m2 4525*c0909341SAndroid Build Coastguard Worker psubusw m3, m6, m3 4526*c0909341SAndroid Build Coastguard Worker vpshldw m%2, m3, 8 4527*c0909341SAndroid Build Coastguard Worker psllw m3, m%2, 10 4528*c0909341SAndroid Build Coastguard Worker%if %5 4529*c0909341SAndroid Build Coastguard Worker psubb m%2, m5, m%2 4530*c0909341SAndroid Build Coastguard Worker%endif 4531*c0909341SAndroid Build Coastguard Worker pmulhw m2, m3 4532*c0909341SAndroid Build Coastguard Worker paddw m1, m2 4533*c0909341SAndroid Build Coastguard Worker pmulhrsw m%1, m7 4534*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 4535*c0909341SAndroid Build Coastguard Worker packuswb m%1, m1 4536*c0909341SAndroid Build Coastguard Worker%endmacro 4537*c0909341SAndroid Build Coastguard Worker 4538*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 4539*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_420_avx512icl_table 4540*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_420_avx512icl_table] 4541*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 4542*c0909341SAndroid Build Coastguard Worker mov r6d, r7m ; sign 4543*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 4544*c0909341SAndroid Build Coastguard Worker movsxd wq, [r7+wq*4] 4545*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, 
[base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 4546*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_2048] 4547*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_m64] ; -1 << 6 4548*c0909341SAndroid Build Coastguard Worker mova ym10, [base+wm_420_mask+32] 4549*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 4550*c0909341SAndroid Build Coastguard Worker add wq, r7 4551*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 4552*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4553*c0909341SAndroid Build Coastguard Worker jmp wq 4554*c0909341SAndroid Build Coastguard Worker.w4: 4555*c0909341SAndroid Build Coastguard Worker mova m5, [wm_420_perm4] 4556*c0909341SAndroid Build Coastguard Worker cmp hd, 8 4557*c0909341SAndroid Build Coastguard Worker jg .w4_h16 4558*c0909341SAndroid Build Coastguard Worker WRAP_YMM W_MASK 0, 4, 0, 1 4559*c0909341SAndroid Build Coastguard Worker vinserti128 ym5, [wm_420_perm4+32], 1 4560*c0909341SAndroid Build Coastguard Worker vpermb ym4, ym5, ym4 4561*c0909341SAndroid Build Coastguard Worker vpdpbusd ym8, ym4, ym9 4562*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m0, 1 4563*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 4564*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 4565*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 4566*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 4567*c0909341SAndroid Build Coastguard Worker jl .w4_end 4568*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4569*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 4570*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 4571*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 4572*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 4573*c0909341SAndroid 
Build Coastguard Worker.w4_end: 4574*c0909341SAndroid Build Coastguard Worker vpermb ym8, ym10, ym8 4575*c0909341SAndroid Build Coastguard Worker movq [maskq], xm8 4576*c0909341SAndroid Build Coastguard Worker RET 4577*c0909341SAndroid Build Coastguard Worker.w4_h16: 4578*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, strided 4579*c0909341SAndroid Build Coastguard Worker pmulld m11, [bidir_sctr_w4] 4580*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4581*c0909341SAndroid Build Coastguard Worker vpermb m4, m5, m4 4582*c0909341SAndroid Build Coastguard Worker vpdpbusd m8, m4, m9 4583*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1 4584*c0909341SAndroid Build Coastguard Worker vpermb m8, m10, m8 4585*c0909341SAndroid Build Coastguard Worker mova [maskq], xm8 4586*c0909341SAndroid Build Coastguard Worker vpscatterdd [dstq+m11]{k1}, m0 4587*c0909341SAndroid Build Coastguard Worker RET 4588*c0909341SAndroid Build Coastguard Worker.w8: 4589*c0909341SAndroid Build Coastguard Worker mova m5, [wm_420_perm8] 4590*c0909341SAndroid Build Coastguard Worker cmp hd, 4 4591*c0909341SAndroid Build Coastguard Worker jne .w8_h8 4592*c0909341SAndroid Build Coastguard Worker WRAP_YMM W_MASK 0, 4, 0, 1 4593*c0909341SAndroid Build Coastguard Worker vinserti128 ym5, [wm_420_perm8+32], 1 4594*c0909341SAndroid Build Coastguard Worker vpermb ym4, ym5, ym4 4595*c0909341SAndroid Build Coastguard Worker vpdpbusd ym8, ym4, ym9 4596*c0909341SAndroid Build Coastguard Worker vpermb m8, m10, m8 4597*c0909341SAndroid Build Coastguard Worker mova [maskq], xm8 4598*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 4599*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 4600*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 4601*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 4602*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 4603*c0909341SAndroid Build Coastguard Worker 
RET 4604*c0909341SAndroid Build Coastguard Worker.w8_loop: 4605*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4606*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4607*c0909341SAndroid Build Coastguard Worker add maskq, 16 4608*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4609*c0909341SAndroid Build Coastguard Worker.w8_h8: 4610*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4611*c0909341SAndroid Build Coastguard Worker vpermb m4, m5, m4 4612*c0909341SAndroid Build Coastguard Worker mova m1, m8 4613*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m9 4614*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4615*c0909341SAndroid Build Coastguard Worker mova [maskq], xm1 4616*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 4617*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 2 4618*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m0, 3 4619*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 4620*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 4621*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 4622*c0909341SAndroid Build Coastguard Worker movq [dstq+stride3q ], xm3 4623*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4624*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0 4625*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 4626*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2 4627*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm3 4628*c0909341SAndroid Build Coastguard Worker sub hd, 8 4629*c0909341SAndroid Build Coastguard Worker jg .w8_loop 4630*c0909341SAndroid Build Coastguard Worker RET 4631*c0909341SAndroid Build Coastguard Worker.w16: 4632*c0909341SAndroid Build Coastguard Worker mova m5, [wm_420_perm16] 4633*c0909341SAndroid Build Coastguard Worker.w16_loop: 4634*c0909341SAndroid Build 
Coastguard Worker W_MASK 0, 4, 0, 1 4635*c0909341SAndroid Build Coastguard Worker vpermb m4, m5, m4 4636*c0909341SAndroid Build Coastguard Worker mova m1, m8 4637*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m9 4638*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4639*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4640*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4641*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 4642*c0909341SAndroid Build Coastguard Worker mova [maskq], xm1 4643*c0909341SAndroid Build Coastguard Worker add maskq, 16 4644*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 4645*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 4646*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1 4647*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 4648*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4649*c0909341SAndroid Build Coastguard Worker sub hd, 4 4650*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4651*c0909341SAndroid Build Coastguard Worker RET 4652*c0909341SAndroid Build Coastguard Worker.w32: 4653*c0909341SAndroid Build Coastguard Worker pmovzxbq m5, [pb_02461357] 4654*c0909341SAndroid Build Coastguard Worker.w32_loop: 4655*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4656*c0909341SAndroid Build Coastguard Worker mova m1, m8 4657*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m9 4658*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4659*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4660*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4661*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, m0 4662*c0909341SAndroid Build Coastguard Worker mova [maskq], xm1 4663*c0909341SAndroid Build Coastguard Worker add maskq, 16 4664*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 
4665*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 4666*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4667*c0909341SAndroid Build Coastguard Worker sub hd, 2 4668*c0909341SAndroid Build Coastguard Worker jg .w32_loop 4669*c0909341SAndroid Build Coastguard Worker RET 4670*c0909341SAndroid Build Coastguard Worker.w64: 4671*c0909341SAndroid Build Coastguard Worker pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 4672*c0909341SAndroid Build Coastguard Worker psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 4673*c0909341SAndroid Build Coastguard Worker.w64_loop: 4674*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 2 4675*c0909341SAndroid Build Coastguard Worker W_MASK 11, 5, 1, 3 4676*c0909341SAndroid Build Coastguard Worker mova m2, m8 4677*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m4, m9 4678*c0909341SAndroid Build Coastguard Worker mova m3, m8 4679*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m5, m9 4680*c0909341SAndroid Build Coastguard Worker add tmp1q, 256 4681*c0909341SAndroid Build Coastguard Worker add tmp2q, 256 4682*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m10, m3 4683*c0909341SAndroid Build Coastguard Worker mova m1, m0 4684*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m12, m11 4685*c0909341SAndroid Build Coastguard Worker vpermt2q m1, m13, m11 4686*c0909341SAndroid Build Coastguard Worker mova [maskq], ym2 4687*c0909341SAndroid Build Coastguard Worker add maskq, 32 4688*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 4689*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 4690*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4691*c0909341SAndroid Build Coastguard Worker sub hd, 2 4692*c0909341SAndroid Build Coastguard Worker jg .w64_loop 4693*c0909341SAndroid Build Coastguard Worker RET 4694*c0909341SAndroid Build Coastguard Worker.w128: 4695*c0909341SAndroid Build Coastguard Worker pmovzxbq 
m14, [wm_420_perm64] 4696*c0909341SAndroid Build Coastguard Worker mova m10, [wm_420_mask] 4697*c0909341SAndroid Build Coastguard Worker psrlq m15, m14, 4 4698*c0909341SAndroid Build Coastguard Worker.w128_loop: 4699*c0909341SAndroid Build Coastguard Worker W_MASK 0, 12, 0, 4 4700*c0909341SAndroid Build Coastguard Worker W_MASK 11, 13, 1, 5 4701*c0909341SAndroid Build Coastguard Worker mova m4, m8 4702*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m12, m9 4703*c0909341SAndroid Build Coastguard Worker mova m5, m8 4704*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m13, m9 4705*c0909341SAndroid Build Coastguard Worker mova m1, m0 4706*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m14, m11 4707*c0909341SAndroid Build Coastguard Worker vpermt2q m1, m15, m11 4708*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+64*0], m0 4709*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+64*0], m1 4710*c0909341SAndroid Build Coastguard Worker W_MASK 0, 12, 2, 6 4711*c0909341SAndroid Build Coastguard Worker W_MASK 11, 13, 3, 7 4712*c0909341SAndroid Build Coastguard Worker vprold m4, 16 4713*c0909341SAndroid Build Coastguard Worker vprold m5, 16 4714*c0909341SAndroid Build Coastguard Worker vpdpbusd m4, m12, m9 4715*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m13, m9 4716*c0909341SAndroid Build Coastguard Worker add tmp1q, 512 4717*c0909341SAndroid Build Coastguard Worker add tmp2q, 512 4718*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m10, m5 4719*c0909341SAndroid Build Coastguard Worker mova m1, m0 4720*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m14, m11 4721*c0909341SAndroid Build Coastguard Worker vpermt2q m1, m15, m11 4722*c0909341SAndroid Build Coastguard Worker mova [maskq], m4 4723*c0909341SAndroid Build Coastguard Worker add maskq, 64 4724*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0+64*1], m0 4725*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1+64*1], m1 
4726*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4727*c0909341SAndroid Build Coastguard Worker sub hd, 2 4728*c0909341SAndroid Build Coastguard Worker jg .w128_loop 4729*c0909341SAndroid Build Coastguard Worker RET 4730*c0909341SAndroid Build Coastguard Worker 4731*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 4732*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx512icl_table 4733*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_422_avx512icl_table] 4734*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 4735*c0909341SAndroid Build Coastguard Worker mov r6d, r7m ; sign 4736*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 4737*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4] 4738*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 4739*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_2048] 4740*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pw_m128] 4741*c0909341SAndroid Build Coastguard Worker mova m10, [base+wm_422_mask] 4742*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pb_127] 4743*c0909341SAndroid Build Coastguard Worker add wq, r7 4744*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+wm_sign+4+r6*4] 4745*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp 4746*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 4747*c0909341SAndroid Build Coastguard Worker jmp wq 4748*c0909341SAndroid Build Coastguard Worker.w4: 4749*c0909341SAndroid Build Coastguard Worker cmp hd, 8 4750*c0909341SAndroid Build Coastguard Worker jg .w4_h16 4751*c0909341SAndroid Build Coastguard Worker WRAP_YMM W_MASK 0, 4, 0, 1 4752*c0909341SAndroid Build Coastguard Worker movhps xm10, [wm_422_mask+16] 4753*c0909341SAndroid Build Coastguard Worker vpdpwssd ym8, ym4, ym9 4754*c0909341SAndroid Build 
Coastguard Worker vpermb ym8, ym10, ym8 4755*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m0, 1 4756*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 4757*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 4758*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 4759*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1 4760*c0909341SAndroid Build Coastguard Worker jl .w4_end 4761*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4762*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2 4763*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3 4764*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2 4765*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 4766*c0909341SAndroid Build Coastguard Worker.w4_end: 4767*c0909341SAndroid Build Coastguard Worker pand xm8, xm11 4768*c0909341SAndroid Build Coastguard Worker mova [maskq], xm8 4769*c0909341SAndroid Build Coastguard Worker RET 4770*c0909341SAndroid Build Coastguard Worker.w4_h16: 4771*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, strided 4772*c0909341SAndroid Build Coastguard Worker pmulld m5, [bidir_sctr_w4] 4773*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4774*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m4, m9 4775*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1 4776*c0909341SAndroid Build Coastguard Worker vpermb m8, m10, m8 4777*c0909341SAndroid Build Coastguard Worker pand ym8, ym11 4778*c0909341SAndroid Build Coastguard Worker mova [maskq], ym8 4779*c0909341SAndroid Build Coastguard Worker vpscatterdd [dstq+m5]{k1}, m0 4780*c0909341SAndroid Build Coastguard Worker RET 4781*c0909341SAndroid Build Coastguard Worker.w8: 4782*c0909341SAndroid Build Coastguard Worker cmp hd, 4 4783*c0909341SAndroid Build Coastguard Worker jne .w8_h8 4784*c0909341SAndroid Build Coastguard 
Worker WRAP_YMM W_MASK 0, 4, 0, 1 4785*c0909341SAndroid Build Coastguard Worker movhps xm10, [wm_422_mask+16] 4786*c0909341SAndroid Build Coastguard Worker vpdpwssd ym8, ym4, ym9 4787*c0909341SAndroid Build Coastguard Worker vpermb ym8, ym10, ym8 4788*c0909341SAndroid Build Coastguard Worker pand xm8, xm11 4789*c0909341SAndroid Build Coastguard Worker mova [maskq], xm8 4790*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 4791*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 4792*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 4793*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 4794*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 4795*c0909341SAndroid Build Coastguard Worker RET 4796*c0909341SAndroid Build Coastguard Worker.w8_loop: 4797*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4798*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4799*c0909341SAndroid Build Coastguard Worker add maskq, 32 4800*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4801*c0909341SAndroid Build Coastguard Worker.w8_h8: 4802*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4803*c0909341SAndroid Build Coastguard Worker mova m1, m8 4804*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m9 4805*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4806*c0909341SAndroid Build Coastguard Worker pand ym1, ym11 4807*c0909341SAndroid Build Coastguard Worker mova [maskq], ym1 4808*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 4809*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 2 4810*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m0, 3 4811*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 4812*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 4813*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 4814*c0909341SAndroid Build 
Coastguard Worker movq [dstq+stride3q ], xm3 4815*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4816*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0 4817*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 4818*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2 4819*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm3 4820*c0909341SAndroid Build Coastguard Worker sub hd, 8 4821*c0909341SAndroid Build Coastguard Worker jg .w8_loop 4822*c0909341SAndroid Build Coastguard Worker RET 4823*c0909341SAndroid Build Coastguard Worker.w16_loop: 4824*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4825*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4826*c0909341SAndroid Build Coastguard Worker add maskq, 32 4827*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 4828*c0909341SAndroid Build Coastguard Worker.w16: 4829*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4830*c0909341SAndroid Build Coastguard Worker mova m1, m8 4831*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m9 4832*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4833*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 4834*c0909341SAndroid Build Coastguard Worker pand ym1, ym11 4835*c0909341SAndroid Build Coastguard Worker mova [maskq], ym1 4836*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 4837*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 4838*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1 4839*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 4840*c0909341SAndroid Build Coastguard Worker sub hd, 4 4841*c0909341SAndroid Build Coastguard Worker jg .w16_loop 4842*c0909341SAndroid Build Coastguard Worker RET 4843*c0909341SAndroid Build Coastguard Worker.w32: 4844*c0909341SAndroid Build Coastguard Worker pmovzxbq m5, 
[pb_02461357] 4845*c0909341SAndroid Build Coastguard Worker.w32_loop: 4846*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4847*c0909341SAndroid Build Coastguard Worker mova m1, m8 4848*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m9 4849*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4850*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4851*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4852*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, m0 4853*c0909341SAndroid Build Coastguard Worker pand ym1, ym11 4854*c0909341SAndroid Build Coastguard Worker mova [maskq], ym1 4855*c0909341SAndroid Build Coastguard Worker add maskq, 32 4856*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 4857*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 4858*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 4859*c0909341SAndroid Build Coastguard Worker sub hd, 2 4860*c0909341SAndroid Build Coastguard Worker jg .w32_loop 4861*c0909341SAndroid Build Coastguard Worker RET 4862*c0909341SAndroid Build Coastguard Worker.w64: 4863*c0909341SAndroid Build Coastguard Worker pmovzxbq m5, [pb_02461357] 4864*c0909341SAndroid Build Coastguard Worker.w64_loop: 4865*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1 4866*c0909341SAndroid Build Coastguard Worker mova m1, m8 4867*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m9 4868*c0909341SAndroid Build Coastguard Worker add tmp1q, 128 4869*c0909341SAndroid Build Coastguard Worker add tmp2q, 128 4870*c0909341SAndroid Build Coastguard Worker vpermb m1, m10, m1 4871*c0909341SAndroid Build Coastguard Worker vpermq m0, m5, m0 4872*c0909341SAndroid Build Coastguard Worker pand ym1, ym11 4873*c0909341SAndroid Build Coastguard Worker mova [maskq], ym1 4874*c0909341SAndroid Build Coastguard Worker add maskq, 32 4875*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 4876*c0909341SAndroid Build Coastguard 
Worker add dstq, strideq
4877*c0909341SAndroid Build Coastguard Worker dec hd
4878*c0909341SAndroid Build Coastguard Worker jg .w64_loop
4879*c0909341SAndroid Build Coastguard Worker RET
; NOTE(review): everything up to the blank line before `cglobal w_mask_444_8bpc`
; is the tail of a routine whose beginning lies outside this excerpt. The setup
; of m8-m11 used below is not visible here, so this part is left undocumented.
4880*c0909341SAndroid Build Coastguard Worker.w128:
4881*c0909341SAndroid Build Coastguard Worker pmovzxbq m13, [pb_02461357]
4882*c0909341SAndroid Build Coastguard Worker.w128_loop:
4883*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1
4884*c0909341SAndroid Build Coastguard Worker W_MASK 12, 5, 2, 3
4885*c0909341SAndroid Build Coastguard Worker mova m2, m8
4886*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m4, m9
4887*c0909341SAndroid Build Coastguard Worker mova m3, m8
4888*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m5, m9
4889*c0909341SAndroid Build Coastguard Worker add tmp1q, 256
4890*c0909341SAndroid Build Coastguard Worker add tmp2q, 256
4891*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m10, m3
4892*c0909341SAndroid Build Coastguard Worker vpermq m0, m13, m0
4893*c0909341SAndroid Build Coastguard Worker vpermq m1, m13, m12
4894*c0909341SAndroid Build Coastguard Worker pand m2, m11
4895*c0909341SAndroid Build Coastguard Worker mova [maskq], m2
4896*c0909341SAndroid Build Coastguard Worker add maskq, 64
4897*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0
4898*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m1
4899*c0909341SAndroid Build Coastguard Worker add dstq, strideq
4900*c0909341SAndroid Build Coastguard Worker dec hd
4901*c0909341SAndroid Build Coastguard Worker jg .w128_loop
4902*c0909341SAndroid Build Coastguard Worker RET
4903*c0909341SAndroid Build Coastguard Worker
; -----------------------------------------------------------------------------
; w_mask_444_8bpc(dst, stride, tmp1, tmp2, w, h, mask)   [stride3 is a scratch reg]
;
; Jumps through w_mask_444_avx512icl_table, indexed by tzcnt(w), to one code
; path per block width (.w4 ... .w128). Every path invokes the W_MASK macro
; (defined outside this excerpt), stores the pixel result register(s) to dst and
; stores m4 (plus m9 for w128), byte-permuted with wm_444_mask, to the mask
; buffer.
;
; Registers prepared before the jump:
;   m5 = [pb_64], m6 = [pw_6903], m7 = [pw_2048]  (dword broadcasts)
;   m8 = wm_444_mask (vpermb index for the mask output)
;   maskq = mask argument, stride3q = stride * 3
; NOTE(review): m5-m7 are presumably the constants consumed by W_MASK and
; W_MASK presumably reads tmp1q/tmp2q without advancing them (the loops below
; advance both pointers themselves) - confirm against the macro definition.
; -----------------------------------------------------------------------------
4904*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
4905*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx512icl_table
4906*c0909341SAndroid Build Coastguard Worker lea r7, [w_mask_444_avx512icl_table]
; wd = log2-style index of the width (trailing zero count of w)
4907*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm
4908*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm
; table entries are 32-bit offsets relative to the table itself (r7 is added below)
4909*c0909341SAndroid Build Coastguard Worker movsxd wq, dword [r7+wq*4]
4910*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4911*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pb_64]
4912*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_2048]
4913*c0909341SAndroid Build Coastguard Worker mova m8, [base+wm_444_mask]
4914*c0909341SAndroid Build Coastguard Worker add wq, r7
4915*c0909341SAndroid Build Coastguard Worker mov maskq, maskmp
4916*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3]
4917*c0909341SAndroid Build Coastguard Worker jmp wq
; --- w == 4: h <= 8 is handled by one YMM-wide W_MASK, taller blocks go to .w4_h16
4918*c0909341SAndroid Build Coastguard Worker.w4:
4919*c0909341SAndroid Build Coastguard Worker cmp hd, 8
4920*c0909341SAndroid Build Coastguard Worker jg .w4_h16
4921*c0909341SAndroid Build Coastguard Worker WRAP_YMM W_MASK 0, 4, 0, 1, 1
; 32-byte permute index: bytes 0-15 of wm_444_mask (already in m8) in the low
; half, bytes 32-47 of wm_444_mask in the high half
4922*c0909341SAndroid Build Coastguard Worker vinserti128 ym8, [wm_444_mask+32], 1
4923*c0909341SAndroid Build Coastguard Worker vpermb ym4, ym8, ym4
; 32 mask bytes are written regardless of whether h is 4 or 8
4924*c0909341SAndroid Build Coastguard Worker mova [maskq], ym4
; rows 0-3 come from dwords 0/1 of 128-bit lanes 0 and 1 of the result
4925*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m0, 1
4926*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0
4927*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1
4928*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1
4929*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 1
; flags are still those of `cmp hd, 8` above: taken when h < 8. This relies on
; nothing in between (presumably including W_MASK) modifying EFLAGS.
4930*c0909341SAndroid Build Coastguard Worker jl .w4_end
; h == 8: rows 4-7 come from dwords 2/3 of the same two lanes
4931*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4]
4932*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 2
4933*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 3
4934*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm1, 2
4935*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3
4936*c0909341SAndroid Build Coastguard Worker.w4_end:
4937*c0909341SAndroid Build Coastguard Worker RET
; --- w == 4, h > 8 (label name suggests h == 16): one full-width W_MASK, the
; 16 result dwords (4 pixels each) are written with a single scatter
4938*c0909341SAndroid Build Coastguard Worker.w4_h16:
; m9 = stride * bidir_sctr_w4[i]: per-dword byte offsets into dst
; (bidir_sctr_w4 presumably lists the row number held by each dword - confirm)
4939*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, strided
4940*c0909341SAndroid Build Coastguard Worker pmulld m9, [bidir_sctr_w4]
4941*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1
4942*c0909341SAndroid Build Coastguard Worker vpermb m4, m8, m4
; k1 = all ones: enable all 16 scatter elements
4943*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1
4944*c0909341SAndroid Build Coastguard Worker mova [maskq], m4
4945*c0909341SAndroid Build Coastguard Worker vpscatterdd [dstq+m9]{k1}, m0
4946*c0909341SAndroid Build Coastguard Worker RET
; --- w == 8: h == 4 uses the YMM-wide variant, everything else the 8-row loop
4947*c0909341SAndroid Build Coastguard Worker.w8:
4948*c0909341SAndroid Build Coastguard Worker cmp hd, 4
4949*c0909341SAndroid Build Coastguard Worker jne .w8_h8
4950*c0909341SAndroid Build Coastguard Worker WRAP_YMM W_MASK 0, 4, 0, 1, 1
; same 32-byte permute index construction as in .w4
4951*c0909341SAndroid Build Coastguard Worker vinserti128 ym8, [wm_444_mask+32], 1
4952*c0909341SAndroid Build Coastguard Worker vpermb ym4, ym8, ym4
4953*c0909341SAndroid Build Coastguard Worker mova [maskq], ym4
; rows 0/1 = low qwords of lanes 0/1, rows 2/3 = high qwords of lanes 0/1
4954*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1
4955*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0
4956*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1
4957*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0
4958*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1
4959*c0909341SAndroid Build Coastguard Worker RET
; loop preamble, skipped on the first iteration: step tmp1/tmp2 by 128 bytes,
; mask by 64 bytes and dst by 4 rows (the other 4 rows are added mid-iteration)
4960*c0909341SAndroid Build Coastguard Worker.w8_loop:
4961*c0909341SAndroid Build Coastguard Worker add tmp1q, 128
4962*c0909341SAndroid Build Coastguard Worker add tmp2q, 128
4963*c0909341SAndroid Build Coastguard Worker add maskq, 64
4964*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4]
; 8 rows per iteration
4965*c0909341SAndroid Build Coastguard Worker.w8_h8:
4966*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1
4967*c0909341SAndroid Build Coastguard Worker vpermb m4, m8, m4
4968*c0909341SAndroid Build Coastguard Worker mova [maskq], m4
; rows 0-3 = low qwords of lanes 0-3, rows 4-7 = high qwords of lanes 0-3
4969*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1
4970*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 2
4971*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m0, 3
4972*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0
4973*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1
4974*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2
4975*c0909341SAndroid Build Coastguard Worker movq [dstq+stride3q ], xm3
4976*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4]
4977*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0
4978*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1
4979*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2
4980*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm3
4981*c0909341SAndroid Build Coastguard Worker sub hd, 8
4982*c0909341SAndroid Build Coastguard Worker jg .w8_loop
4983*c0909341SAndroid Build Coastguard Worker RET
; --- w == 16: 4 rows per iteration; the preamble is skipped on the first pass
4984*c0909341SAndroid Build Coastguard Worker.w16_loop:
4985*c0909341SAndroid Build Coastguard Worker add tmp1q, 128
4986*c0909341SAndroid Build Coastguard Worker add tmp2q, 128
4987*c0909341SAndroid Build Coastguard Worker add maskq, 64
4988*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4]
4989*c0909341SAndroid Build Coastguard Worker.w16:
4990*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1
4991*c0909341SAndroid Build Coastguard Worker vpermb m4, m8, m4
; qword shuffle (q3120 pattern) applied before the row stores
4992*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120
4993*c0909341SAndroid Build Coastguard Worker mova [maskq], m4
; rows 0,1,2,3 are taken from 128-bit lanes 0,2,1,3 respectively
4994*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0
4995*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2
4996*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1
4997*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3
4998*c0909341SAndroid Build Coastguard Worker sub hd, 4
4999*c0909341SAndroid Build Coastguard Worker jg .w16_loop
5000*c0909341SAndroid Build Coastguard Worker RET
; --- w == 32: 2 rows per iteration
4*c0909341SAndroid Build Coastguard Worker
5028*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5029*c0909341SAndroid Build Coastguard Worker dec hd 5030*c0909341SAndroid Build Coastguard Worker jg .w64_loop 5031*c0909341SAndroid Build Coastguard Worker RET 5032*c0909341SAndroid Build Coastguard Worker.w128: 5033*c0909341SAndroid Build Coastguard Worker pmovzxbq m11, [pb_02461357] 5034*c0909341SAndroid Build Coastguard Worker.w128_loop: 5035*c0909341SAndroid Build Coastguard Worker W_MASK 0, 4, 0, 1, 1 5036*c0909341SAndroid Build Coastguard Worker W_MASK 10, 9, 2, 3, 1 5037*c0909341SAndroid Build Coastguard Worker vpermb m4, m8, m4 5038*c0909341SAndroid Build Coastguard Worker vpermb m9, m8, m9 5039*c0909341SAndroid Build Coastguard Worker add tmp1q, 256 5040*c0909341SAndroid Build Coastguard Worker add tmp2q, 256 5041*c0909341SAndroid Build Coastguard Worker vpermq m0, m11, m0 5042*c0909341SAndroid Build Coastguard Worker vpermq m10, m11, m10 5043*c0909341SAndroid Build Coastguard Worker mova [maskq+64*0], m4 5044*c0909341SAndroid Build Coastguard Worker mova [maskq+64*1], m9 5045*c0909341SAndroid Build Coastguard Worker add maskq, 128 5046*c0909341SAndroid Build Coastguard Worker mova [dstq+64*0], m0 5047*c0909341SAndroid Build Coastguard Worker mova [dstq+64*1], m10 5048*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5049*c0909341SAndroid Build Coastguard Worker dec hd 5050*c0909341SAndroid Build Coastguard Worker jg .w128_loop 5051*c0909341SAndroid Build Coastguard Worker RET 5052*c0909341SAndroid Build Coastguard Worker 5053*c0909341SAndroid Build Coastguard Workercglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask 5054*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx512icl_table 5055*c0909341SAndroid Build Coastguard Worker lea r6, [blend_avx512icl_table] 5056*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5057*c0909341SAndroid Build Coastguard Worker movifnidn maskq, maskmp 5058*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 
5059*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 5060*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pb_64] 5061*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pw_512] 5062*c0909341SAndroid Build Coastguard Worker sub tmpq, maskq 5063*c0909341SAndroid Build Coastguard Worker add wq, r6 5064*c0909341SAndroid Build Coastguard Worker lea r6, [dsq*3] 5065*c0909341SAndroid Build Coastguard Worker jmp wq 5066*c0909341SAndroid Build Coastguard Worker.w4: 5067*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5068*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [dstq+dsq*1], 1 5069*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm1, [dstq+dsq*2] 5070*c0909341SAndroid Build Coastguard Worker pinsrd xmm1, [dstq+r6 ], 3 5071*c0909341SAndroid Build Coastguard Worker mova xmm4, [maskq] 5072*c0909341SAndroid Build Coastguard Worker mova xmm5, [maskq+tmpq] 5073*c0909341SAndroid Build Coastguard Worker add maskq, 4*4 5074*c0909341SAndroid Build Coastguard Worker psubb xmm3, xm6, xmm4 5075*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm5 5076*c0909341SAndroid Build Coastguard Worker punpcklbw xmm2, xmm3, xmm4 5077*c0909341SAndroid Build Coastguard Worker punpckhbw xmm1, xmm5 5078*c0909341SAndroid Build Coastguard Worker punpckhbw xmm3, xmm4 5079*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm2 5080*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm1, xmm3 5081*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm7 5082*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xm7 5083*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm1 5084*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 5085*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 5086*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*2], xmm0, 2 5087*c0909341SAndroid Build Coastguard Worker pextrd [dstq+r6 ], xmm0, 3 5088*c0909341SAndroid Build Coastguard 
Worker lea dstq, [dstq+dsq*4] 5089*c0909341SAndroid Build Coastguard Worker sub hd, 4 5090*c0909341SAndroid Build Coastguard Worker jg .w4 5091*c0909341SAndroid Build Coastguard Worker RET 5092*c0909341SAndroid Build Coastguard Worker.w8: 5093*c0909341SAndroid Build Coastguard Worker movq xmm0, [dstq+dsq*0] 5094*c0909341SAndroid Build Coastguard Worker vpbroadcastq xmm1, [dstq+dsq*1] 5095*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm2, [dstq+dsq*2] 5096*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm3, [dstq+r6 ] 5097*c0909341SAndroid Build Coastguard Worker mova ymm4, [maskq] 5098*c0909341SAndroid Build Coastguard Worker mova ymm5, [maskq+tmpq] 5099*c0909341SAndroid Build Coastguard Worker add maskq, 8*4 5100*c0909341SAndroid Build Coastguard Worker vpblendd ymm0, ymm2, 0x30 5101*c0909341SAndroid Build Coastguard Worker vpblendd ymm1, ymm3, 0xc0 5102*c0909341SAndroid Build Coastguard Worker psubb ymm3, ym6, ymm4 5103*c0909341SAndroid Build Coastguard Worker punpcklbw ymm0, ymm5 5104*c0909341SAndroid Build Coastguard Worker punpcklbw ymm2, ymm3, ymm4 5105*c0909341SAndroid Build Coastguard Worker punpckhbw ymm1, ymm5 5106*c0909341SAndroid Build Coastguard Worker punpckhbw ymm3, ymm4 5107*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm0, ymm2 5108*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm1, ymm3 5109*c0909341SAndroid Build Coastguard Worker pmulhrsw ymm0, ym7 5110*c0909341SAndroid Build Coastguard Worker pmulhrsw ymm1, ym7 5111*c0909341SAndroid Build Coastguard Worker packuswb ymm0, ymm1 5112*c0909341SAndroid Build Coastguard Worker vextracti128 xmm1, ymm0, 1 5113*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm0 5114*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 5115*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*2], xmm1 5116*c0909341SAndroid Build Coastguard Worker movhps [dstq+r6 ], xmm1 5117*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5118*c0909341SAndroid 
Build Coastguard Worker sub hd, 4 5119*c0909341SAndroid Build Coastguard Worker jg .w8 5120*c0909341SAndroid Build Coastguard Worker vzeroupper 5121*c0909341SAndroid Build Coastguard Worker RET 5122*c0909341SAndroid Build Coastguard Worker.w16: 5123*c0909341SAndroid Build Coastguard Worker mova xm1, [dstq+dsq*0] 5124*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, [dstq+dsq*1], 1 5125*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [dstq+dsq*2], 2 5126*c0909341SAndroid Build Coastguard Worker mova m4, [maskq] 5127*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, [dstq+r6 ], 3 5128*c0909341SAndroid Build Coastguard Worker mova m5, [maskq+tmpq] 5129*c0909341SAndroid Build Coastguard Worker add maskq, 16*4 5130*c0909341SAndroid Build Coastguard Worker psubb m3, m6, m4 5131*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m5 5132*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 5133*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m5 5134*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 5135*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 5136*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5137*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 5138*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 5139*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5140*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5141*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], ym0, 1 5142*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*2], m0, 2 5143*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r6 ], m0, 3 5144*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*4] 5145*c0909341SAndroid Build Coastguard Worker sub hd, 4 5146*c0909341SAndroid Build Coastguard Worker jg .w16 5147*c0909341SAndroid Build Coastguard Worker RET 5148*c0909341SAndroid Build Coastguard Worker.w32: 5149*c0909341SAndroid Build 
Coastguard Worker mova ym1, [dstq+dsq*0] 5150*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [dstq+dsq*1], 1 5151*c0909341SAndroid Build Coastguard Worker mova m4, [maskq] 5152*c0909341SAndroid Build Coastguard Worker mova m5, [maskq+tmpq] 5153*c0909341SAndroid Build Coastguard Worker add maskq, 32*2 5154*c0909341SAndroid Build Coastguard Worker psubb m3, m6, m4 5155*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m5 5156*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 5157*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m5 5158*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 5159*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 5160*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5161*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 5162*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 5163*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5164*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 5165*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5166*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5167*c0909341SAndroid Build Coastguard Worker sub hd, 2 5168*c0909341SAndroid Build Coastguard Worker jg .w32 5169*c0909341SAndroid Build Coastguard Worker RET 5170*c0909341SAndroid Build Coastguard Worker 5171*c0909341SAndroid Build Coastguard Workercglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask 5172*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_avx512icl_table 5173*c0909341SAndroid Build Coastguard Worker lea r5, [blend_v_avx512icl_table] 5174*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5175*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 5176*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 5177*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_512] 5178*c0909341SAndroid Build Coastguard Worker add wq, r5 5179*c0909341SAndroid Build Coastguard 
Worker add maskq, obmc_masks-blend_v_avx512icl_table 5180*c0909341SAndroid Build Coastguard Worker jmp wq 5181*c0909341SAndroid Build Coastguard Worker.w2: 5182*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm2, [maskq+2*2] 5183*c0909341SAndroid Build Coastguard Worker.w2_s0_loop: 5184*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5185*c0909341SAndroid Build Coastguard Worker pinsrw xmm0, [dstq+dsq*1], 1 5186*c0909341SAndroid Build Coastguard Worker movd xmm1, [tmpq] 5187*c0909341SAndroid Build Coastguard Worker add tmpq, 2*2 5188*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm1 5189*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm2 5190*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm5 5191*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm0 5192*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm0, 0 5193*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm0, 1 5194*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5195*c0909341SAndroid Build Coastguard Worker sub hd, 2 5196*c0909341SAndroid Build Coastguard Worker jg .w2_s0_loop 5197*c0909341SAndroid Build Coastguard Worker RET 5198*c0909341SAndroid Build Coastguard Worker.w4: 5199*c0909341SAndroid Build Coastguard Worker vpbroadcastq xmm2, [maskq+4*2] 5200*c0909341SAndroid Build Coastguard Worker.w4_loop: 5201*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5202*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [dstq+dsq*1], 1 5203*c0909341SAndroid Build Coastguard Worker movq xmm1, [tmpq] 5204*c0909341SAndroid Build Coastguard Worker add tmpq, 4*2 5205*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm1 5206*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm2 5207*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm5 5208*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm0 5209*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 
5210*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 5211*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5212*c0909341SAndroid Build Coastguard Worker sub hd, 2 5213*c0909341SAndroid Build Coastguard Worker jg .w4_loop 5214*c0909341SAndroid Build Coastguard Worker RET 5215*c0909341SAndroid Build Coastguard Worker.w8: 5216*c0909341SAndroid Build Coastguard Worker mova xmm3, [maskq+8*2] 5217*c0909341SAndroid Build Coastguard Worker.w8_loop: 5218*c0909341SAndroid Build Coastguard Worker movq xmm0, [dstq+dsq*0] 5219*c0909341SAndroid Build Coastguard Worker vpbroadcastq xmm1, [dstq+dsq*1] 5220*c0909341SAndroid Build Coastguard Worker mova xmm2, [tmpq] 5221*c0909341SAndroid Build Coastguard Worker add tmpq, 8*2 5222*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm2 5223*c0909341SAndroid Build Coastguard Worker punpckhbw xmm1, xmm2 5224*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm3 5225*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm1, xmm3 5226*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm5 5227*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm1, xm5 5228*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm1 5229*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*0], xmm0 5230*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*1], xmm0 5231*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5232*c0909341SAndroid Build Coastguard Worker sub hd, 2 5233*c0909341SAndroid Build Coastguard Worker jg .w8_loop 5234*c0909341SAndroid Build Coastguard Worker RET 5235*c0909341SAndroid Build Coastguard Worker.w16: 5236*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [maskq+16*2] 5237*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [maskq+16*3] 5238*c0909341SAndroid Build Coastguard Worker.w16_loop: 5239*c0909341SAndroid Build Coastguard Worker mova xm1, [dstq+dsq*0] 5240*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, 
[dstq+dsq*1], 1 5241*c0909341SAndroid Build Coastguard Worker mova ym2, [tmpq] 5242*c0909341SAndroid Build Coastguard Worker add tmpq, 16*2 5243*c0909341SAndroid Build Coastguard Worker punpcklbw ym0, ym1, ym2 5244*c0909341SAndroid Build Coastguard Worker punpckhbw ym1, ym2 5245*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym3 5246*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym4 5247*c0909341SAndroid Build Coastguard Worker pmulhrsw ym0, ym5 5248*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym5 5249*c0909341SAndroid Build Coastguard Worker packuswb ym0, ym1 5250*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5251*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], m0, 1 5252*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5253*c0909341SAndroid Build Coastguard Worker sub hd, 2 5254*c0909341SAndroid Build Coastguard Worker jg .w16_loop 5255*c0909341SAndroid Build Coastguard Worker RET 5256*c0909341SAndroid Build Coastguard Worker.w32: 5257*c0909341SAndroid Build Coastguard Worker mova m4, [maskq+32*2] 5258*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m4, m4, q2020 5259*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m4, q3131 5260*c0909341SAndroid Build Coastguard Worker.w32_loop: 5261*c0909341SAndroid Build Coastguard Worker mova ym1, [dstq+dsq*0] 5262*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [dstq+dsq*1], 1 5263*c0909341SAndroid Build Coastguard Worker mova m2, [tmpq] 5264*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 5265*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 5266*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 5267*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 5268*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m4 5269*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5270*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5271*c0909341SAndroid Build Coastguard Worker 
packuswb m0, m1 5272*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], ym0 5273*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5274*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5275*c0909341SAndroid Build Coastguard Worker sub hd, 2 5276*c0909341SAndroid Build Coastguard Worker jg .w32_loop 5277*c0909341SAndroid Build Coastguard Worker RET 5278*c0909341SAndroid Build Coastguard Worker 5279*c0909341SAndroid Build Coastguard Workercglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask 5280*c0909341SAndroid Build Coastguard Worker%define base r6-blend_h_avx512icl_table 5281*c0909341SAndroid Build Coastguard Worker lea r6, [blend_h_avx512icl_table] 5282*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 5283*c0909341SAndroid Build Coastguard Worker mov hd, hm 5284*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 5285*c0909341SAndroid Build Coastguard Worker lea maskq, [base+obmc_masks+hq*2] 5286*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pw_512] 5287*c0909341SAndroid Build Coastguard Worker lea hd, [hq*3] 5288*c0909341SAndroid Build Coastguard Worker add wq, r6 5289*c0909341SAndroid Build Coastguard Worker shr hd, 2 ; h * 3/4 5290*c0909341SAndroid Build Coastguard Worker lea maskq, [maskq+hq*2] 5291*c0909341SAndroid Build Coastguard Worker neg hq 5292*c0909341SAndroid Build Coastguard Worker jmp wq 5293*c0909341SAndroid Build Coastguard Worker.w2: 5294*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5295*c0909341SAndroid Build Coastguard Worker pinsrw xmm0, [dstq+dsq*1], 1 5296*c0909341SAndroid Build Coastguard Worker movd xmm2, [maskq+hq*2] 5297*c0909341SAndroid Build Coastguard Worker movd xmm1, [tmpq] 5298*c0909341SAndroid Build Coastguard Worker add tmpq, 2*2 5299*c0909341SAndroid Build Coastguard Worker punpcklwd xmm2, xmm2 5300*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm1 5301*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, 
xmm2 5302*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm5 5303*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm0 5304*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*0], xmm0, 0 5305*c0909341SAndroid Build Coastguard Worker pextrw [dstq+dsq*1], xmm0, 1 5306*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5307*c0909341SAndroid Build Coastguard Worker add hq, 2 5308*c0909341SAndroid Build Coastguard Worker jl .w2 5309*c0909341SAndroid Build Coastguard Worker RET 5310*c0909341SAndroid Build Coastguard Worker.w4: 5311*c0909341SAndroid Build Coastguard Worker mova xmm3, [blend_shuf] 5312*c0909341SAndroid Build Coastguard Worker.w4_loop: 5313*c0909341SAndroid Build Coastguard Worker movd xmm0, [dstq+dsq*0] 5314*c0909341SAndroid Build Coastguard Worker pinsrd xmm0, [dstq+dsq*1], 1 5315*c0909341SAndroid Build Coastguard Worker movd xmm2, [maskq+hq*2] 5316*c0909341SAndroid Build Coastguard Worker movq xmm1, [tmpq] 5317*c0909341SAndroid Build Coastguard Worker add tmpq, 4*2 5318*c0909341SAndroid Build Coastguard Worker pshufb xmm2, xmm3 5319*c0909341SAndroid Build Coastguard Worker punpcklbw xmm0, xmm1 5320*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm2 5321*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xm5 5322*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm0 5323*c0909341SAndroid Build Coastguard Worker movd [dstq+dsq*0], xmm0 5324*c0909341SAndroid Build Coastguard Worker pextrd [dstq+dsq*1], xmm0, 1 5325*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5326*c0909341SAndroid Build Coastguard Worker add hq, 2 5327*c0909341SAndroid Build Coastguard Worker jl .w4_loop 5328*c0909341SAndroid Build Coastguard Worker RET 5329*c0909341SAndroid Build Coastguard Worker.w8: 5330*c0909341SAndroid Build Coastguard Worker vbroadcasti128 ymm4, [blend_shuf] 5331*c0909341SAndroid Build Coastguard Worker shufpd ymm4, ymm4, 0x03 5332*c0909341SAndroid Build Coastguard Worker.w8_loop: 
5333*c0909341SAndroid Build Coastguard Worker vpbroadcastq ymm1, [dstq+dsq*0] 5334*c0909341SAndroid Build Coastguard Worker movq xmm0, [dstq+dsq*1] 5335*c0909341SAndroid Build Coastguard Worker vpblendd ymm0, ymm1, 0x30 5336*c0909341SAndroid Build Coastguard Worker vpbroadcastd ymm3, [maskq+hq*2] 5337*c0909341SAndroid Build Coastguard Worker movq xmm1, [tmpq+8*1] 5338*c0909341SAndroid Build Coastguard Worker vinserti128 ymm1, [tmpq+8*0], 1 5339*c0909341SAndroid Build Coastguard Worker add tmpq, 8*2 5340*c0909341SAndroid Build Coastguard Worker pshufb ymm3, ymm4 5341*c0909341SAndroid Build Coastguard Worker punpcklbw ymm0, ymm1 5342*c0909341SAndroid Build Coastguard Worker pmaddubsw ymm0, ymm3 5343*c0909341SAndroid Build Coastguard Worker pmulhrsw ymm0, ym5 5344*c0909341SAndroid Build Coastguard Worker vextracti128 xmm1, ymm0, 1 5345*c0909341SAndroid Build Coastguard Worker packuswb xmm0, xmm1 5346*c0909341SAndroid Build Coastguard Worker movhps [dstq+dsq*0], xmm0 5347*c0909341SAndroid Build Coastguard Worker movq [dstq+dsq*1], xmm0 5348*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5349*c0909341SAndroid Build Coastguard Worker add hq, 2 5350*c0909341SAndroid Build Coastguard Worker jl .w8_loop 5351*c0909341SAndroid Build Coastguard Worker vzeroupper 5352*c0909341SAndroid Build Coastguard Worker RET 5353*c0909341SAndroid Build Coastguard Worker.w16: 5354*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [blend_shuf] 5355*c0909341SAndroid Build Coastguard Worker shufpd ym4, ym4, 0x0c 5356*c0909341SAndroid Build Coastguard Worker.w16_loop: 5357*c0909341SAndroid Build Coastguard Worker mova xm1, [dstq+dsq*0] 5358*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, [dstq+dsq*1], 1 5359*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym3, [maskq+hq*2] 5360*c0909341SAndroid Build Coastguard Worker mova ym2, [tmpq] 5361*c0909341SAndroid Build Coastguard Worker add tmpq, 16*2 5362*c0909341SAndroid Build Coastguard Worker pshufb 
ym3, ym4 5363*c0909341SAndroid Build Coastguard Worker punpcklbw ym0, ym1, ym2 5364*c0909341SAndroid Build Coastguard Worker punpckhbw ym1, ym2 5365*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym3 5366*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym3 5367*c0909341SAndroid Build Coastguard Worker pmulhrsw ym0, ym5 5368*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym5 5369*c0909341SAndroid Build Coastguard Worker packuswb ym0, ym1 5370*c0909341SAndroid Build Coastguard Worker mova [dstq+dsq*0], xm0 5371*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+dsq*1], m0, 1 5372*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5373*c0909341SAndroid Build Coastguard Worker add hq, 2 5374*c0909341SAndroid Build Coastguard Worker jl .w16_loop 5375*c0909341SAndroid Build Coastguard Worker RET 5376*c0909341SAndroid Build Coastguard Worker.w32: 5377*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [blend_shuf] 5378*c0909341SAndroid Build Coastguard Worker shufpd m4, m4, 0xf0 5379*c0909341SAndroid Build Coastguard Worker.w32_loop: 5380*c0909341SAndroid Build Coastguard Worker mova ym1, [dstq+dsq*0] 5381*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [dstq+dsq*1], 1 5382*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [maskq+hq*2] 5383*c0909341SAndroid Build Coastguard Worker mova m2, [tmpq] 5384*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 5385*c0909341SAndroid Build Coastguard Worker pshufb m3, m4 5386*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 5387*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 5388*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 5389*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5390*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5391*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5392*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5393*c0909341SAndroid Build Coastguard Worker 
mova [dstq+dsq*0], ym0 5394*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+dsq*1], m0, 1 5395*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+dsq*2] 5396*c0909341SAndroid Build Coastguard Worker add hq, 2 5397*c0909341SAndroid Build Coastguard Worker jl .w32_loop 5398*c0909341SAndroid Build Coastguard Worker RET 5399*c0909341SAndroid Build Coastguard Worker.w64: 5400*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, [maskq+hq*2] 5401*c0909341SAndroid Build Coastguard Worker mova m1, [dstq] 5402*c0909341SAndroid Build Coastguard Worker mova m2, [tmpq] 5403*c0909341SAndroid Build Coastguard Worker add tmpq, 32*2 5404*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 5405*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 5406*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3 5407*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 5408*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m5 5409*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 5410*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5411*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 5412*c0909341SAndroid Build Coastguard Worker add dstq, dsq 5413*c0909341SAndroid Build Coastguard Worker inc hq 5414*c0909341SAndroid Build Coastguard Worker jl .w64 5415*c0909341SAndroid Build Coastguard Worker RET 5416*c0909341SAndroid Build Coastguard Worker.w128: 5417*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [maskq+hq*2] 5418*c0909341SAndroid Build Coastguard Worker mova m2, [dstq+64*0] 5419*c0909341SAndroid Build Coastguard Worker mova m1, [tmpq+64*0] 5420*c0909341SAndroid Build Coastguard Worker mova m3, [dstq+64*1] 5421*c0909341SAndroid Build Coastguard Worker mova m4, [tmpq+64*1] 5422*c0909341SAndroid Build Coastguard Worker add tmpq, 64*2 5423*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m1 5424*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m1 5425*c0909341SAndroid Build Coastguard 
; Remainder of the .w128 row loop of the preceding function (its head is above).
    pmaddubsw            m0, m6
    pmaddubsw            m2, m6
    punpcklbw            m1, m3, m4
    punpckhbw            m3, m4
    pmaddubsw            m1, m6
    pmaddubsw            m3, m6
    REPX   {pmulhrsw x, m5}, m0, m2, m1, m3
    packuswb             m0, m2
    packuswb             m1, m3
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET

;------------------------------------------------------------------------------
; resize_8bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0)
;
; Horizontal resampling of 8-bit rows, 16 output pixels per inner iteration.
;
; The source position of every output pixel is kept as a dword with 14
; fractional bits:
;   - position >> 14        is the byte offset into the source row
;   - (position >> 8) & 63  is the index into resize_filter (8 bytes per entry)
; Each output pixel is an 8-tap dot product of 8 consecutive source bytes with
; the 8 selected coefficients.
;
; Positions are clamped to [0, (src_w-8) << 14]. When any of the 16 lanes was
; clamped, the slow path rebuilds the 8 source bytes of every lane with a
; per-lane pshufb pattern gathered from resize_shuf (edge emulation); otherwise
; the source bytes are gathered directly.
;
; Every iteration stores a full 16 bytes, and the x loop runs while x < dst_w,
; so up to 15 bytes beyond dst_w can be written on each row.
; NOTE(review): presumably callers guarantee that padding - confirm.
;
; Long-lived registers:
;   m2  = 0                          m3  = pw_m256 (output scaling)
;   m5  = dx*16 (step per iteration) m6  = (src_w-8) << 14 (upper clip bound)
;   m7  = pd_63 (filter index mask)  m8  = mx0 + dx*[0-15] (row start positions)
;   m15 = pb_8x0_8x8                 m16/m17/xm18 = resize_permA/B/C
;   k3  = all-ones mask template, copied to k1/k2 before each gather because
;         a gather clears the mask register it is given
;------------------------------------------------------------------------------
cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub          dword mx0m, 4<<14 ; mx0 -= 4.0 in 14-bit fixed point
                                   ; NOTE(review): looks like the 8-tap centring offset - confirm
    sub        dword src_wm, 8     ; src_w - 8, the highest valid 8-byte window start
    mov                  r6, ~0
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm
    kmovq                k3, r6    ; all-ones gather mask template
    ; r6 is free again from here on; it is renamed to x (the output column).
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
%define base r7-$$                 ; constants are addressed relative to the section start
    vpbroadcastd         m3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti32x4     m15, [base+pb_8x0_8x8]
    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld                m5, 4                      ; dx*16
    pslld                m6, 14                     ; clip bound in the same fixed-point format as mx
    pxor                 m2, m2
    mova                m16, [base+resize_permA]
    mova                m17, [base+resize_permB]
    mova               xm18, [base+resize_permC]
.loop_y:
    xor                  xd, xd
    mova                 m4, m8     ; per-line working version of mx
.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8  ; filter offset (unmasked)
    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0 ; pshufb offset
    psrad                m0, 14     ; clipped src_x offset
    psrad                m1, 14     ; pshufb edge_emu offset
    vptestmd             k4, m1, m1 ; k4 = lanes whose position was clamped by >= 1 pixel
    pand                 m9, m7     ; filter offset (masked)
    ktestw               k4, k4
    jz .load                        ; no lane needs edge emulation
    ; Edge-emulation path: qword gathers take 8 dword indices each, so the
    ; 16 lanes are processed as two halves (low: m10/m14, high: m11/m0).
    vextracti32x8      ym12, m0, 1
    vextracti32x8      ym13, m1, 1
    kmovq                k1, k3
    kmovq                k2, k3
    vpgatherdq      m10{k1}, [srcq+ym0]  ; 8 source bytes per lane at the clipped offset
    vpgatherdq      m11{k2}, [srcq+ym12]
    kmovq                k1, k3
    kmovq                k2, k3
    vpgatherdq      m14{k1}, [base+resize_shuf+4+ym1]  ; per-lane 8-byte pshufb pattern
    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13]
    mova                m12, m16
    mova                m13, m17
    ; NOTE(review): pb_8x0_8x8 presumably adds 8 to the indices of the upper
    ; qword of each 128-bit lane so pshufb stays within that qword - confirm.
    paddb               m14, m15
    paddb                m0, m15
    pshufb              m10, m14
    pshufb              m11, m0
    ; Rearrange the dwords of m10/m11 into m12/m13 using the permA/permB index
    ; tables; .filter consumes m12/m13 exactly as the .load path produces them.
    vpermi2d            m12, m10, m11
    vpermi2d            m13, m10, m11
    jmp .filter
.load:
    kmovq                k1, k3
    kmovq                k2, k3
    vpgatherdd      m12{k1}, [srcq+m0+0] ; source bytes 0-3 of each lane
    vpgatherdd      m13{k2}, [srcq+m0+4] ; source bytes 4-7 of each lane
.filter:
    kmovq                k1, k3
    kmovq                k2, k3
    vpgatherdd      m10{k1}, [base+resize_filter+m9*8+0] ; coefficients 0-3
    vpgatherdd      m11{k2}, [base+resize_filter+m9*8+4] ; coefficients 4-7
    mova                m14, m2          ; zero the accumulator
    vpdpbusd            m14, m12, m10    ; unsigned pixels * signed coefficients
    vpdpbusd            m14, m13, m11
    packssdw            m14, m14
    ; NOTE(review): assuming pw_m256 holds -256, this is a rounded multiply by
    ; -1/128, which would imply the coefficients are stored negated - confirm.
    pmulhrsw            m14, m3
    packuswb            m14, m14
    vpermd              m14, m18, m14    ; collect the 16 result bytes into xm14
    mova          [dstq+xq], xm14
    paddd                m4, m5          ; advance all positions by 16 output pixels
    add                  xd, 16
    cmp                  xd, dst_wd
    jl .loop_x
    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET

%endif ; ARCH_X86_64