/* xref: /aosp_15_r20/external/zstd/lib/decompress/huf_decompress_amd64.S (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722) */
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "../common/portability_macros.h"

#if defined(__ELF__) && defined(__GNUC__)
/* Stack marking
 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
 *
 * Emitting an empty .note.GNU-stack section (no "x" flag) tells the linker
 * that this object does not require an executable stack.
 */
.section .note.GNU-stack,"",%progbits

#if defined(__aarch64__)
/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
 * See: https://github.com/facebook/zstd/issues/3841
 * See: https://gcc.godbolt.org/z/sqr5T4ffK
 * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
 * See: https://reviews.llvm.org/D62609
 *
 * The note below is laid out by hand: a 12 byte header (namesz, descsz, type),
 * the 4 byte name "GNU\0", then a 16 byte descriptor holding one property
 * (pr_type, pr_datasz, pr_data) padded to 8 byte alignment.
 */
.pushsection .note.gnu.property, "a"
.p2align 3
.long 4                 /* size of the name - "GNU\0" */
.long 0x10              /* size of descriptor */
.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
.asciz "GNU"
.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4                 /* pr_datasz - 4 bytes */
.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
.popsection
#endif

#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2

/* Syntax: GNU as, AT&T operand order (source, destination), preprocessed by
 * the C preprocessor.
 *
 * Calling convention:
 *
 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
 * %rbp isn't maintained (no frame pointer).
 * %rsp contains the stack pointer that grows down.
 *      No red-zone is assumed, only addresses >= %rsp are used.
 * All register contents are preserved.
 *
 * TODO: Support Windows calling convention.
 */

/* Both the plain and the '_'-prefixed spelling of each entry point are hidden
 * and exported, because both labels are defined on each function below.
 */
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.text

/* Sets up register mappings for clarity.
 * op[], bits[], dtable & ip[0] each get their own register.
 * ip[1,2,3] & olimit alias var[].
 * %rax is a scratch register.
 *
 * Note: op3 lives in %rdi, the register that carries the argument pointer on
 * entry, so the argument pointer has to be copied elsewhere before op3 is
 * loaded. bits0 lives in %rbp, which is why no frame pointer is maintained.
 */

#define op0    rsi
#define op1    rbx
#define op2    rcx
#define op3    rdi

#define ip0    r8
#define ip1    r9
#define ip2    r10
#define ip3    r11

#define bits0  rbp
#define bits1  rdx
#define bits2  r12
#define bits3  r13
#define dtable r14
#define olimit r15

/* var[] aliases ip[1,2,3] & olimit
 * ip[1,2,3] are saved every iteration.
 * olimit is only used in compute_olimit.
 *
 * Any code that writes var[n] therefore destroys the matching ip[n] / olimit
 * register; the live value has to be kept on the stack in the meantime.
 */
#define var0   r15
#define var1   r9
#define var2   r10
#define var3   r11

/* 32-bit var registers */
#define vard0  r15d
#define vard1  r9d
#define vard2  r10d
#define vard3  r11d

/* Calls X(N) for each stream 0, 1, 2, 3.
 * The expansions are joined with ';' so that they form separate statements
 * on a single logical line.
 */
#define FOR_EACH_STREAM(X) \
    X(0);                  \
    X(1);                  \
    X(2);                  \
    X(3)

/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
    X(0, idx);                             \
    X(1, idx);                             \
    X(2, idx);                             \
    X(3, idx)

/* Define both _HUF_* & HUF_* symbols because MacOS
 * C symbols are prefixed with '_' & Linux symbols aren't.
 */
/* HUF_decompress4X1_usingDTable_internal_fast_asm_loop
 *
 * In:  %rdi = HUF_DecompressAsmArgs*. Fields are accessed by byte offset:
 *          0, 8, 16, 24    ip[0..3]
 *         32, 40, 48, 56   op[0..3]
 *         64, 72, 80, 88   bits[0..3]
 *         96               dtable
 *        104               ilowest
 *        112               oend
 * Out: ip[], op[] & bits[] are written back to the argument struct.
 *      Every general purpose register is restored before returning.
 *
 * Stack frame while running (byte offsets from %rsp):
 *      0, 8, 16   scratch: rbx/rdx inside compute_olimit,
 *                          ip[1,2,3] inside the loop
 *     24          olimit
 *     32          oend
 *     40          ilowest
 *     48          argument pointer
 */
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    /* Read HUF_DecompressAsmArgs* args from %rax
     * (%rdi is copied first because op3 aliases %rdi)
     */
    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push 104(%rax) /* ilowest */
    push 112(%rax) /* oend */
    push %olimit   /* olimit space (value is overwritten before it is read) */

    /* 3 scratch slots */
    subq $24, %rsp

.L_4X1_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rbx, rdx must be saved
     * op3 & ip0 mustn't be clobbered
     */
    movq %rbx, 0(%rsp)
    movq %rdx, 8(%rsp)

    movq 32(%rsp), %rax /* rax = oend */
    subq %op3,    %rax  /* rax = oend - op3 */

    /* r15 = (oend - op3) / 5
     * The constant is 0xCCCCCCCCCCCCCCCD; the high half of the 128-bit
     * product shifted right by 2 is the unsigned quotient by 5.
     */
    movabsq $-3689348814741910323, %rdx
    mulq %rdx
    movq %rdx, %r15
    shrq $2, %r15

    movq %ip0,     %rax /* rax = ip0 */
    movq 40(%rsp), %rdx /* rdx = ilowest */
    subq %rdx,     %rax /* rax = ip0 - ilowest */
    movq %rax,     %rbx /* rbx = ip0 - ilowest */

    /* rdx = (ip0 - ilowest) / 7
     * The constant is 0x2492492492492493. With hi = high half of the product
     * and n = ip0 - ilowest: rdx = (((n - hi) >> 1) + hi) >> 2.
     */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %rbx
    shrq %rbx
    addq %rbx, %rdx
    shrq $2, %rdx

    /* r15 = min(%rdx, %r15) */
    cmpq %rdx, %r15
    cmova %rdx, %r15

    /* r15 = r15 * 5 */
    leaq (%r15, %r15, 4), %r15

    /* olimit = op3 + r15 (olimit is %r15) */
    addq %op3, %olimit

    movq 8(%rsp), %rdx
    movq 0(%rsp), %rbx

    /* If olimit == op3 there is no room for even one iteration: exit */
    movq %op3, %rax    /* rax = op3 */
    cmpq %rax, %olimit /* op3 == olimit */
    je .L_4X1_exit

    /* If (ip1 < ip0) go to exit */
    cmpq %ip0, %ip1
    jb .L_4X1_exit

    /* If (ip2 < ip1) go to exit */
    cmpq %ip1, %ip2
    jb .L_4X1_exit

    /* If (ip3 < ip2) go to exit */
    cmpq %ip2, %ip3
    jb .L_4X1_exit

/* Reads top 11 bits from bits[n]
 * Loads dt[bits[n]] into var[n]
 * (16-bit table entries, zero extended; clobbers var[n])
 */
#define GET_NEXT_DELT(n)                \
    movq $53, %var##n;                  \
    shrxq %var##n, %bits##n, %var##n;   \
    movzwl (%dtable,%var##n,2),%vard##n

/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
 * Moves var[n] to %rax
 * bits[n] <<= var[n] & 63
 * op[n][idx] = %rax >> 8
 * %ah is a way to access bits [8, 16) of %rax
 */
#define DECODE_FROM_DELT(n, idx)       \
    movq %var##n, %rax;                \
    shlxq %var##n, %bits##n, %bits##n; \
    movb %ah, idx(%op##n)

/* Assumes GET_NEXT_DELT has been called.
 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
 */
#define DECODE_AND_GET_NEXT(n, idx) \
    DECODE_FROM_DELT(n, idx);       \
    GET_NEXT_DELT(n)

/* // ctz & nbBytes is stored in bits[n]
 * // nbBits is stored in %rax
 * ctz  = CTZ[bits[n]]
 * nbBits  = ctz & 7
 * nbBytes = ctz >> 3
 * op[n]  += 5
 * ip[n]  -= nbBytes
 * // Note: x86-64 is little-endian ==> no bswap
 * bits[n] = MEM_readST(ip[n]) | 1
 * bits[n] <<= nbBits
 */
#define RELOAD_BITS(n)             \
    bsfq %bits##n, %bits##n;       \
    movq %bits##n, %rax;           \
    andq $7, %rax;                 \
    shrq $3, %bits##n;             \
    leaq 5(%op##n), %op##n;        \
    subq %bits##n, %ip##n;         \
    movq (%ip##n), %bits##n;       \
    orq $1, %bits##n;              \
    shlxq %rax, %bits##n, %bits##n

    /* Store clobbered variables on the stack
     * (GET_NEXT_DELT overwrites var[], which aliases olimit & ip[1,2,3])
     */
    movq %olimit, 24(%rsp)
    movq %ip1, 0(%rsp)
    movq %ip2, 8(%rsp)
    movq %ip3, 16(%rsp)

    /* Call GET_NEXT_DELT for each stream */
    FOR_EACH_STREAM(GET_NEXT_DELT)

    /* Align the loop entry to 64 bytes */
    .p2align 6

.L_4X1_loop_body:
    /* Decode 5 symbols in each of the 4 streams (20 total)
     * Must have called GET_NEXT_DELT for each stream
     */
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)

    /* Load ip[1,2,3] from stack (var[] aliases them)
     * ip[] is needed for RELOAD_BITS
     * Each will be stored back to the stack after RELOAD
     */
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    /* Reload each stream & fetch the next table entry
     * to prepare for the next iteration
     */
    RELOAD_BITS(0)
    GET_NEXT_DELT(0)

    RELOAD_BITS(1)
    movq %ip1, 0(%rsp)
    GET_NEXT_DELT(1)

    RELOAD_BITS(2)
    movq %ip2, 8(%rsp)
    GET_NEXT_DELT(2)

    RELOAD_BITS(3)
    movq %ip3, 16(%rsp)
    GET_NEXT_DELT(3)

    /* If op3 < olimit: continue the loop
     * (unsigned compare of the saved olimit against op3)
     */
    cmp %op3, 24(%rsp)
    ja .L_4X1_loop_body

    /* Reload ip[1,2,3] from stack */
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    /* Re-compute olimit */
    jmp .L_4X1_compute_olimit

#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE_AND_GET_NEXT
#undef RELOAD_BITS
.L_4X1_exit:
    /* Drop the 3 scratch slots */
    addq $24, %rsp

    /* Restore stack (oend & olimit) */
    pop %rax /* olimit */
    pop %rax /* oend */
    pop %rax /* ilowest */
    pop %rax /* arg */

    /* Save ip / op / bits */
    movq %ip0,  0(%rax)
    movq %ip1,  8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    /* Restore registers (reverse order of the pushes on entry) */
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

/* HUF_decompress4X2_usingDTable_internal_fast_asm_loop
 *
 * In:  %rdi = HUF_DecompressAsmArgs*. Fields are accessed by byte offset:
 *          0, 8, 16, 24    ip[0..3]
 *         32, 40, 48, 56   op[0..3]
 *         64, 72, 80, 88   bits[0..3]
 *         96               dtable
 *        104               ilowest
 *        112               oend
 * Out: ip[], op[] & bits[] are written back to the argument struct.
 *      Every general purpose register is restored before returning.
 *
 * Stack frame while running (byte offsets from %rsp):
 *      0   scratch: rdx inside compute_olimit, r8 (ip0) inside the loop
 *      8   oend0 (initial op1)
 *     16   oend1 (initial op2)
 *     24   oend2 (initial op3)
 *     32   oend3 (args->oend)
 *     40   ilowest
 *     48   olimit
 *     56   argument pointer
 */
_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    /* Read HUF_DecompressAsmArgs* args from %rax
     * (%rdi is copied first because op3 aliases %rdi)
     */
    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push %rax      /* olimit (slot only, value is overwritten before it is read) */
    push 104(%rax) /* ilowest */

    movq 112(%rax), %rax
    push %rax /* oend3 */

    /* The end of each of the first three output segments is the initial
     * start of the following one.
     */
    movq %op3, %rax
    push %rax /* oend2 */

    movq %op2, %rax
    push %rax /* oend1 */

    movq %op1, %rax
    push %rax /* oend0 */

    /* Scratch space */
    subq $8, %rsp

.L_4X2_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rdx must be saved
     * op[0,1,2,3] & ip0 mustn't be clobbered
     */
    movq %rdx, 0(%rsp)

    /* We can consume up to 7 input bytes each iteration. */
    movq %ip0,     %rax  /* rax = ip0 */
    movq 40(%rsp), %rdx  /* rdx = ilowest */
    subq %rdx,     %rax  /* rax = ip0 - ilowest */
    movq %rax,    %r15   /* r15 = ip0 - ilowest */

    /* rdx = rax / 7
     * The constant is 0x2492492492492493. With hi = high half of the product
     * and n = ip0 - ilowest: rdx = (((n - hi) >> 1) + hi) >> 2.
     */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %r15
    shrq %r15
    addq %r15, %rdx
    shrq $2, %rdx

    /* r15 = (ip0 - ilowest) / 7 */
    movq %rdx, %r15

    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
    movq 8(%rsp),  %rax /* rax = oend0 */
    subq %op0,     %rax /* rax = oend0 - op0 */
    movq 16(%rsp), %rdx /* rdx = oend1 */
    subq %op1,     %rdx /* rdx = oend1 - op1 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 24(%rsp), %rax /* rax = oend2 */
    subq %op2,     %rax /* rax = oend2 - op2 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 32(%rsp), %rax /* rax = oend3 */
    subq %op3,     %rax /* rax = oend3 - op3 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    /* The constant is 0xCCCCCCCCCCCCCCCD; the high half of the 128-bit
     * product shifted right by 3 is the unsigned quotient by 10.
     */
    movabsq $-3689348814741910323, %rax
    mulq %rdx
    shrq $3,       %rdx /* rdx = rdx / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq  %rdx, %r15
    cmova %rdx, %r15

    /* olimit = op3 + 5 * r15 (olimit is %r15, so the count is copied first) */
    movq %r15, %rax
    leaq (%op3, %rax, 4), %olimit
    addq %rax, %olimit

    movq 0(%rsp), %rdx

    /* If olimit == op3 there is no room for even one iteration: exit */
    movq %op3, %rax    /* rax = op3 */
    cmpq %rax, %olimit /* op3 == olimit */
    je .L_4X2_exit

    /* If (ip1 < ip0) go to exit */
    cmpq %ip0, %ip1
    jb .L_4X2_exit

    /* If (ip2 < ip1) go to exit */
    cmpq %ip1, %ip2
    jb .L_4X2_exit

    /* If (ip3 < ip2) go to exit */
    cmpq %ip2, %ip3
    jb .L_4X2_exit

/* One table lookup for stream n (idx is unused here):
 * %rax    = top 11 bits of bits[n]
 * entry   = 4 bytes at dtable + 4 * %rax
 * %r8d    = entry bytes [0, 2)  -> written as 2 bytes at op[n]
 * %r15d   = entry byte 2        -> bits[n] <<= %r15
 * %eax    = entry byte 3        -> op[n] += %rax
 * Clobbers %rax, %r8 (ip0) & %r15 (olimit).
 */
#define DECODE(n, idx)              \
    movq %bits##n, %rax;            \
    shrq $53, %rax;                 \
    movzwl 0(%dtable,%rax,4),%r8d;  \
    movzbl 2(%dtable,%rax,4),%r15d; \
    movzbl 3(%dtable,%rax,4),%eax;  \
    movw %r8w, (%op##n);            \
    shlxq %r15, %bits##n, %bits##n; \
    addq %rax, %op##n

/* ctz     = CTZ[bits[n]]
 * nbBits  = ctz & 7   (kept in %rax)
 * nbBytes = ctz >> 3  (kept in bits[n])
 * ip[n]  -= nbBytes
 * bits[n] = MEM_readST(ip[n]) | 1
 * bits[n] <<= nbBits
 */
#define RELOAD_BITS(n)              \
    bsfq %bits##n, %bits##n;        \
    movq %bits##n, %rax;            \
    shrq $3, %bits##n;              \
    andq $7, %rax;                  \
    subq %bits##n, %ip##n;          \
    movq (%ip##n), %bits##n;        \
    orq $1, %bits##n;               \
    shlxq %rax, %bits##n, %bits##n


    /* DECODE clobbers %r15, so keep olimit on the stack */
    movq %olimit, 48(%rsp)

    /* Align the loop entry to 64 bytes */
    .p2align 6

.L_4X2_loop_body:
    /* We clobber r8, so store it on the stack */
    movq %r8, 0(%rsp)

    /* 5 table lookups in each of the 4 streams (20 lookups total).
     * Each lookup writes 2 bytes and advances op[n] by the length stored in
     * the table entry.
     */
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    /* Reload r8 */
    movq 0(%rsp), %r8

    FOR_EACH_STREAM(RELOAD_BITS)

    /* If op3 < olimit: continue the loop, otherwise re-compute olimit */
    cmp %op3, 48(%rsp)
    ja .L_4X2_loop_body
    jmp .L_4X2_compute_olimit

#undef DECODE
#undef RELOAD_BITS
.L_4X2_exit:
    /* Drop the scratch slot */
    addq $8, %rsp
    /* Restore stack (oend & olimit) */
    pop %rax /* oend0 */
    pop %rax /* oend1 */
    pop %rax /* oend2 */
    pop %rax /* oend3 */
    pop %rax /* ilowest */
    pop %rax /* olimit */
    pop %rax /* arg */

    /* Save ip / op / bits */
    movq %ip0,  0(%rax)
    movq %ip1,  8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    /* Restore registers (reverse order of the pushes on entry) */
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */