/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "../common/portability_macros.h"

#if defined(__ELF__) && defined(__GNUC__)
/* Stack marking
 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
 */
.section .note.GNU-stack,"",%progbits

#if defined(__aarch64__)
/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
 * See: https://github.com/facebook/zstd/issues/3841
 * See: https://gcc.godbolt.org/z/sqr5T4ffK
 * See: https://lore.kernel.org/linux-arm-kernel/[email protected]/
 *      (NOTE(review): the message-id in the link above was replaced by
 *      "[email protected]" in this copy of the file - restore it from upstream.)
 * See: https://reviews.llvm.org/D62609
 */
.pushsection .note.gnu.property, "a"
.p2align 3
.long 4                 /* size of the name - "GNU\0" */
.long 0x10              /* size of descriptor */
.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
.asciz "GNU"
.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4                 /* pr_datasz - 4 bytes */
.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
.popsection
#endif

#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2

/* Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor. The loops use the BMI2 instructions shlx / shrx.
 *
 * Calling convention:
 *
 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
 * %rbp isn't maintained (no frame pointer).
 * %rsp contains the stack pointer that grows down.
 *      No red-zone is assumed, only addresses >= %rsp are used.
 * All register contents are preserved (including %rax, so nothing is
 * returned in a register; results are written back through the argument).
 *
 * TODO: Support Windows calling convention.
 *
 * Layout of the argument structure as accessed by this file
 * (byte offsets from the HUF_DecompressAsmArgs pointer):
 *
 *     0,  8, 16, 24 : ip[0..3]    read on entry, written on exit
 *    32, 40, 48, 56 : op[0..3]    read on entry, written on exit
 *    64, 72, 80, 88 : bits[0..3]  read on entry, written on exit
 *    96             : dtable      read only
 *   104             : ilowest     read only
 *   112             : oend        read only
 */

ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.text

/* Sets up register mappings for clarity.
 * op[], bits[], dtable & ip[0] each get their own register.
 * ip[1,2,3] & olimit alias var[].
 * %rax is a scratch register.
 */

#define op0    rsi
#define op1    rbx
#define op2    rcx
#define op3    rdi

#define ip0    r8
#define ip1    r9
#define ip2    r10
#define ip3    r11

#define bits0  rbp
#define bits1  rdx
#define bits2  r12
#define bits3  r13
#define dtable r14
#define olimit r15

/* var[] aliases ip[1,2,3] & olimit
 * ip[1,2,3] are saved every iteration.
 * olimit is only used in compute_olimit.
 */
#define var0   r15
#define var1   r9
#define var2   r10
#define var3   r11

/* 32-bit var registers */
#define vard0  r15d
#define vard1  r9d
#define vard2  r10d
#define vard3  r11d

/* Calls X(N) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM(X) \
    X(0);                  \
    X(1);                  \
    X(2);                  \
    X(3)

/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
    X(0, idx);                             \
    X(1, idx);                             \
    X(2, idx);                             \
    X(3, idx)

/*-----------------------------------------------------------------------
 * HUF_decompress4X1_usingDTable_internal_fast_asm_loop
 *
 * In:    %rdi = HUF_DecompressAsmArgs* (layout described above)
 * Out:   ip[], op[] and bits[] are stored back into the argument.
 * Clobb: none - every general purpose register is saved and restored.
 *
 * Each loop iteration decodes 5 symbols (one byte each) from each of the
 * 4 streams, using 2-byte table entries: the low byte is used as the
 * shift count (bits consumed), the high byte is the byte written out.
 *
 * Stack frame while the loop runs (offsets from %rsp):
 *    0 : scratch - %rbx in compute_olimit, ip1 in the loop
 *    8 : scratch - %rdx in compute_olimit, ip2 in the loop
 *   16 : scratch - ip3 in the loop
 *   24 : olimit
 *   32 : oend
 *   40 : ilowest
 *   48 : HUF_DecompressAsmArgs*
 *
 * Define both _HUF_* & HUF_* symbols because MacOS
 * C symbols are prefixed with '_' & Linux symbols aren't.
 *-----------------------------------------------------------------------
 */
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    /* Read HUF_DecompressAsmArgs* args from %rax.
     * %rdi is op3, so the pointer is moved to %rax before op3 is loaded.
     */
    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push 104(%rax) /* ilowest */
    push 112(%rax) /* oend */
    push %olimit   /* olimit space (value is a placeholder) */

    /* Three 8-byte scratch slots: 0(%rsp), 8(%rsp), 16(%rsp) */
    subq $24, %rsp

.L_4X1_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rbx, rdx must be saved
     * op3 & ip0 mustn't be clobbered
     */
    movq %rbx, 0(%rsp)
    movq %rdx, 8(%rsp)

    movq 32(%rsp), %rax /* rax = oend */
    subq %op3,     %rax /* rax = oend - op3 */

    /* r15 = (oend - op3) / 5
     * Reciprocal multiply: the constant is 0xCCCCCCCCCCCCCCCD,
     * the quotient is the high half of the product shifted right by 2.
     */
    movabsq $-3689348814741910323, %rdx
    mulq %rdx
    movq %rdx, %r15
    shrq $2, %r15

    movq %ip0,     %rax /* rax = ip0 */
    movq 40(%rsp), %rdx /* rdx = ilowest */
    subq %rdx,     %rax /* rax = ip0 - ilowest */
    movq %rax,     %rbx /* rbx = ip0 - ilowest */

    /* rdx = (ip0 - ilowest) / 7
     * Reciprocal multiply with 0x2492492492492493:
     *   hi  = high half of (n * constant)
     *   rdx = (((n - hi) >> 1) + hi) >> 2
     */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %rbx
    shrq %rbx
    addq %rbx, %rdx
    shrq $2, %rdx

    /* r15 = min(%rdx, %r15) */
    cmpq  %rdx, %r15
    cmova %rdx, %r15

    /* r15 = r15 * 5 */
    leaq (%r15, %r15, 4), %r15

    /* olimit = op3 + r15 (olimit is %r15) */
    addq %op3, %olimit

    movq 8(%rsp), %rdx
    movq 0(%rsp), %rbx

    /* If (olimit == op3) not even one iteration is possible: go to exit */
    movq %op3, %rax    /* rax = op3 */
    cmpq %rax, %olimit /* op3 == olimit */
    je .L_4X1_exit

    /* If (ip1 < ip0) go to exit */
    cmpq %ip0, %ip1
    jb .L_4X1_exit

    /* If (ip2 < ip1) go to exit */
    cmpq %ip1, %ip2
    jb .L_4X1_exit

    /* If (ip3 < ip2) go to exit */
    cmpq %ip2, %ip3
    jb .L_4X1_exit

/* Reads top 11 bits from bits[n]
 * Loads dt[bits[n]] into var[n]
 */
#define GET_NEXT_DELT(n)                \
    movq $53, %var##n;                  \
    shrxq %var##n, %bits##n, %var##n;   \
    movzwl (%dtable,%var##n,2),%vard##n

/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
 * Moves var[n] to %rax
 * bits[n] <<= var[n] & 63
 * op[n][idx] = %rax >> 8
 * %ah is a way to access bits [8, 16) of %rax
 */
#define DECODE_FROM_DELT(n, idx)       \
    movq %var##n, %rax;                \
    shlxq %var##n, %bits##n, %bits##n; \
    movb %ah, idx(%op##n)

/* Assumes GET_NEXT_DELT has been called.
 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
 */
#define DECODE_AND_GET_NEXT(n, idx) \
    DECODE_FROM_DELT(n, idx);       \
    GET_NEXT_DELT(n)

/* // ctz & nbBytes is stored in bits[n]
 * // nbBits is stored in %rax
 * ctz  = CTZ[bits[n]]
 * nbBits  = ctz & 7
 * nbBytes = ctz >> 3
 * op[n]  += 5
 * ip[n]  -= nbBytes
 * // Note: x86-64 is little-endian ==> no bswap
 * bits[n] = MEM_readST(ip[n]) | 1
 * bits[n] <<= nbBits
 */
#define RELOAD_BITS(n)              \
    bsfq %bits##n, %bits##n;        \
    movq %bits##n, %rax;            \
    andq $7, %rax;                  \
    shrq $3, %bits##n;              \
    leaq 5(%op##n), %op##n;         \
    subq %bits##n, %ip##n;          \
    movq (%ip##n), %bits##n;        \
    orq $1, %bits##n;               \
    shlxq %rax, %bits##n, %bits##n

    /* Store clobbered variables on the stack:
     * var[] aliases olimit & ip[1,2,3], and GET_NEXT_DELT overwrites var[].
     */
    movq %olimit, 24(%rsp)
    movq %ip1, 0(%rsp)
    movq %ip2, 8(%rsp)
    movq %ip3, 16(%rsp)

    /* Call GET_NEXT_DELT for each stream */
    FOR_EACH_STREAM(GET_NEXT_DELT)

    .p2align 6

.L_4X1_loop_body:
    /* Decode 5 symbols in each of the 4 streams (20 total)
     * Must have called GET_NEXT_DELT for each stream
     */
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)

    /* Load ip[1,2,3] from stack (var[] aliases them)
     * ip[] is needed for RELOAD_BITS
     * Each will be stored back to the stack after RELOAD
     */
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    /* Reload each stream & fetch the next table entry
     * to prepare for the next iteration
     */
    RELOAD_BITS(0)
    GET_NEXT_DELT(0)

    RELOAD_BITS(1)
    movq %ip1, 0(%rsp)
    GET_NEXT_DELT(1)

    RELOAD_BITS(2)
    movq %ip2, 8(%rsp)
    GET_NEXT_DELT(2)

    RELOAD_BITS(3)
    movq %ip3, 16(%rsp)
    GET_NEXT_DELT(3)

    /* If op3 < olimit: continue the loop */
    cmp %op3, 24(%rsp)
    ja .L_4X1_loop_body

    /* Reload ip[1,2,3] from stack */
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    /* Re-compute olimit */
    jmp .L_4X1_compute_olimit

#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE_AND_GET_NEXT
#undef RELOAD_BITS
.L_4X1_exit:
    /* Drop the three scratch slots */
    addq $24, %rsp

    /* Pop the remaining stack slots; the last pop leaves the
     * HUF_DecompressAsmArgs* in %rax.
     */
    pop %rax /* olimit */
    pop %rax /* oend */
    pop %rax /* ilowest */
    pop %rax /* arg */

    /* Save ip / op / bits */
    movq %ip0,  0(%rax)
    movq %ip1,  8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    /* Restore registers */
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

/*-----------------------------------------------------------------------
 * HUF_decompress4X2_usingDTable_internal_fast_asm_loop
 *
 * In:    %rdi = HUF_DecompressAsmArgs* (layout described above)
 * Out:   ip[], op[] and bits[] are stored back into the argument.
 * Clobb: none - every general purpose register is saved and restored.
 *
 * Each loop iteration performs 5 table lookups per stream, using 4-byte
 * table entries: bytes 0-1 are written to the output, byte 2 is used as
 * the shift count (bits consumed) and byte 3 is added to op[n].
 *
 * Stack frame while the loop runs (offsets from %rsp):
 *    0 : scratch - %rdx in compute_olimit, %r8 (ip0) in the loop
 *    8 : oend0 (initial op1)
 *   16 : oend1 (initial op2)
 *   24 : oend2 (initial op3)
 *   32 : oend3 (oend from the argument)
 *   40 : ilowest
 *   48 : olimit
 *   56 : HUF_DecompressAsmArgs*
 *-----------------------------------------------------------------------
 */
_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    /* Read HUF_DecompressAsmArgs* args from %rax.
     * %rdi is op3, so the pointer is moved to %rax before op3 is loaded.
     */
    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push %rax      /* olimit (value is a placeholder) */
    push 104(%rax) /* ilowest */

    /* Each stream's output ends where the next stream's output begins */
    movq 112(%rax), %rax
    push %rax /* oend3 */

    movq %op3, %rax
    push %rax /* oend2 */

    movq %op2, %rax
    push %rax /* oend1 */

    movq %op1, %rax
    push %rax /* oend0 */

    /* Scratch space */
    subq $8, %rsp

.L_4X2_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rdx must be saved
     * op[0,1,2,3] & ip0 mustn't be clobbered
     */
    movq %rdx, 0(%rsp)

    /* We can consume up to 7 input bytes each iteration. */
    movq %ip0,     %rax  /* rax = ip0 */
    movq 40(%rsp), %rdx  /* rdx = ilowest */
    subq %rdx,     %rax  /* rax = ip0 - ilowest */
    movq %rax,     %r15  /* r15 = ip0 - ilowest */

    /* rdx = rax / 7
     * Reciprocal multiply with 0x2492492492492493:
     *   hi  = high half of (n * constant)
     *   rdx = (((n - hi) >> 1) + hi) >> 2
     */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %r15
    shrq %r15
    addq %r15, %rdx
    shrq $2, %rdx

    /* r15 = (ip0 - ilowest) / 7 */
    movq %rdx, %r15

    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
    movq 8(%rsp),  %rax /* rax = oend0 */
    subq %op0,     %rax /* rax = oend0 - op0 */
    movq 16(%rsp), %rdx /* rdx = oend1 */
    subq %op1,     %rdx /* rdx = oend1 - op1 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 24(%rsp), %rax /* rax = oend2 */
    subq %op2,     %rax /* rax = oend2 - op2 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 32(%rsp), %rax /* rax = oend3 */
    subq %op3,     %rax /* rax = oend3 - op3 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    /* Reciprocal multiply with 0xCCCCCCCCCCCCCCCD:
     * the quotient by 10 is the high half of the product shifted right by 3.
     */
    movabsq $-3689348814741910323, %rax
    mulq %rdx
    shrq $3,       %rdx /* rdx = rdx / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq  %rdx, %r15
    cmova %rdx, %r15

    /* olimit = op3 + 5 * r15 */
    movq %r15, %rax
    leaq (%op3, %rax, 4), %olimit
    addq %rax, %olimit

    movq 0(%rsp), %rdx

    /* If (olimit == op3) not even one iteration is possible: go to exit */
    movq %op3, %rax    /* rax = op3 */
    cmpq %rax, %olimit /* op3 == olimit */
    je .L_4X2_exit

    /* If (ip1 < ip0) go to exit */
    cmpq %ip0, %ip1
    jb .L_4X2_exit

    /* If (ip2 < ip1) go to exit */
    cmpq %ip1, %ip2
    jb .L_4X2_exit

    /* If (ip3 < ip2) go to exit */
    cmpq %ip2, %ip3
    jb .L_4X2_exit

/* One table lookup for stream n (idx is unused).
 * %rax  = top 11 bits of bits[n]
 * %r8d  = bytes 0-1 of the 4-byte table entry, stored to op[n]
 * %r15d = byte 2 of the entry, bits[n] <<= %r15 & 63
 * %eax  = byte 3 of the entry, op[n] += %rax
 * Clobbers %rax, %r8 (ip0) & %r15 (olimit).
 */
#define DECODE(n, idx)              \
    movq %bits##n, %rax;            \
    shrq $53, %rax;                 \
    movzwl 0(%dtable,%rax,4),%r8d;  \
    movzbl 2(%dtable,%rax,4),%r15d; \
    movzbl 3(%dtable,%rax,4),%eax;  \
    movw %r8w, (%op##n);            \
    shlxq %r15, %bits##n, %bits##n; \
    addq %rax, %op##n

/* // ctz & nbBytes is stored in bits[n]
 * // nbBits is stored in %rax
 * ctz  = CTZ[bits[n]]
 * nbBits  = ctz & 7
 * nbBytes = ctz >> 3
 * ip[n]  -= nbBytes
 * bits[n] = (8 bytes loaded from ip[n]) | 1
 * bits[n] <<= nbBits
 */
#define RELOAD_BITS(n)              \
    bsfq %bits##n, %bits##n;        \
    movq %bits##n, %rax;            \
    shrq $3, %bits##n;              \
    andq $7, %rax;                  \
    subq %bits##n, %ip##n;          \
    movq (%ip##n), %bits##n;        \
    orq $1, %bits##n;               \
    shlxq %rax, %bits##n, %bits##n


    /* DECODE clobbers %r15, so olimit lives on the stack during the loop */
    movq %olimit, 48(%rsp)

    .p2align 6

.L_4X2_loop_body:
    /* We clobber r8, so store it on the stack */
    movq %r8, 0(%rsp)

    /* Decode 5 table entries from each of the 4 streams (20 lookups total). */
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    /* Reload r8 */
    movq 0(%rsp), %r8

    FOR_EACH_STREAM(RELOAD_BITS)

    /* If op3 < olimit: continue the loop, otherwise re-compute olimit */
    cmp %op3, 48(%rsp)
    ja .L_4X2_loop_body
    jmp .L_4X2_compute_olimit

#undef DECODE
#undef RELOAD_BITS
.L_4X2_exit:
    /* Drop the scratch slot */
    addq $8, %rsp
    /* Pop the remaining stack slots; the last pop leaves the
     * HUF_DecompressAsmArgs* in %rax.
     */
    pop %rax /* oend0 */
    pop %rax /* oend1 */
    pop %rax /* oend2 */
    pop %rax /* oend3 */
    pop %rax /* ilowest */
    pop %rax /* olimit */
    pop %rax /* arg */

    /* Save ip / op / bits */
    movq %ip0,  0(%rax)
    movq %ip1,  8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    /* Restore registers */
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

#endif