xref: /aosp_15_r20/external/zstd/lib/decompress/huf_decompress_amd64.S (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#include "../common/portability_macros.h"
12
13#if defined(__ELF__) && defined(__GNUC__)
14/* Stack marking
15 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16 */
17.section .note.GNU-stack,"",%progbits
18
19#if defined(__aarch64__)
20/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21 * See: https://github.com/facebook/zstd/issues/3841
22 * See: https://gcc.godbolt.org/z/sqr5T4ffK
23 * See: https://lore.kernel.org/linux-arm-kernel/[email protected]/
24 * See: https://reviews.llvm.org/D62609
25 */
26.pushsection .note.gnu.property, "a"
27.p2align 3
28.long 4                 /* size of the name - "GNU\0" */
29.long 0x10              /* size of descriptor */
30.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
31.asciz "GNU"
32.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33.long 4                 /* pr_datasz - 4 bytes */
34.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
36.popsection
37#endif
38
39#endif
40
41#if ZSTD_ENABLE_ASM_X86_64_BMI2
42
43/* Calling convention:
44 *
45 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
46 * %rbp isn't maintained (no frame pointer).
47 * %rsp contains the stack pointer that grows down.
48 *      No red-zone is assumed, only addresses >= %rsp are used.
49 * All register contents are preserved.
50 *
51 * TODO: Support Windows calling convention.
52 */
53
54ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
55ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
56ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
57ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
58.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
61.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
62.text
63
64/* Sets up register mappings for clarity.
65 * op[], bits[], dtable & ip[0] each get their own register.
66 * ip[1,2,3] & olimit alias var[].
67 * %rax is a scratch register.
68 */
69
70#define op0    rsi
71#define op1    rbx
72#define op2    rcx
73#define op3    rdi
74
75#define ip0    r8
76#define ip1    r9
77#define ip2    r10
78#define ip3    r11
79
80#define bits0  rbp
81#define bits1  rdx
82#define bits2  r12
83#define bits3  r13
84#define dtable r14
85#define olimit r15
86
87/* var[] aliases ip[1,2,3] & olimit
88 * ip[1,2,3] are saved every iteration.
89 * olimit is only used in compute_olimit.
90 */
91#define var0   r15
92#define var1   r9
93#define var2   r10
94#define var3   r11
95
96/* 32-bit var registers */
97#define vard0  r15d
98#define vard1  r9d
99#define vard2  r10d
100#define vard3  r11d
101
102/* Calls X(N) for each stream 0, 1, 2, 3. */
103#define FOR_EACH_STREAM(X) \
104    X(0);                  \
105    X(1);                  \
106    X(2);                  \
107    X(3)
108
109/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
110#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
111    X(0, idx);                             \
112    X(1, idx);                             \
113    X(2, idx);                             \
114    X(3, idx)
115
116/* Define both _HUF_* & HUF_* symbols because MacOS
117 * C symbols are prefixed with '_' & Linux symbols aren't.
118 */
119_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
120HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
121    ZSTD_CET_ENDBRANCH
122    /* Save all registers - even if they are callee saved for simplicity. */
123    push %rax
124    push %rbx
125    push %rcx
126    push %rdx
127    push %rbp
128    push %rsi
129    push %rdi
130    push %r8
131    push %r9
132    push %r10
133    push %r11
134    push %r12
135    push %r13
136    push %r14
137    push %r15
138
139    /* Read HUF_DecompressAsmArgs* args from %rax */
140    movq %rdi, %rax
141    movq  0(%rax), %ip0
142    movq  8(%rax), %ip1
143    movq 16(%rax), %ip2
144    movq 24(%rax), %ip3
145    movq 32(%rax), %op0
146    movq 40(%rax), %op1
147    movq 48(%rax), %op2
148    movq 56(%rax), %op3
149    movq 64(%rax), %bits0
150    movq 72(%rax), %bits1
151    movq 80(%rax), %bits2
152    movq 88(%rax), %bits3
153    movq 96(%rax), %dtable
154    push %rax      /* argument */
155    push 104(%rax) /* ilowest */
156    push 112(%rax) /* oend */
157    push %olimit   /* olimit space */
158
159    subq $24, %rsp
160
161.L_4X1_compute_olimit:
162    /* Computes how many iterations we can do safely
163     * %r15, %rax may be clobbered
164     * rbx, rdx must be saved
165     * op3 & ip0 mustn't be clobbered
166     */
167    movq %rbx, 0(%rsp)
168    movq %rdx, 8(%rsp)
169
170    movq 32(%rsp), %rax /* rax = oend */
171    subq %op3,    %rax  /* rax = oend - op3 */
172
173    /* r15 = (oend - op3) / 5 */
174    movabsq $-3689348814741910323, %rdx
175    mulq %rdx
176    movq %rdx, %r15
177    shrq $2, %r15
178
179    movq %ip0,     %rax /* rax = ip0 */
180    movq 40(%rsp), %rdx /* rdx = ilowest */
181    subq %rdx,     %rax /* rax = ip0 - ilowest */
182    movq %rax,     %rbx /* rbx = ip0 - ilowest */
183
184    /* rdx = (ip0 - ilowest) / 7 */
185    movabsq $2635249153387078803, %rdx
186    mulq %rdx
187    subq %rdx, %rbx
188    shrq %rbx
189    addq %rbx, %rdx
190    shrq $2, %rdx
191
192    /* r15 = min(%rdx, %r15) */
193    cmpq %rdx, %r15
194    cmova %rdx, %r15
195
196    /* r15 = r15 * 5 */
197    leaq (%r15, %r15, 4), %r15
198
199    /* olimit = op3 + r15 */
200    addq %op3, %olimit
201
202    movq 8(%rsp), %rdx
203    movq 0(%rsp), %rbx
204
205    /* If (op3 + 20 > olimit) */
206    movq %op3, %rax    /* rax = op3 */
207    cmpq %rax, %olimit /* op3 == olimit */
208    je .L_4X1_exit
209
210    /* If (ip1 < ip0) go to exit */
211    cmpq %ip0, %ip1
212    jb .L_4X1_exit
213
214    /* If (ip2 < ip1) go to exit */
215    cmpq %ip1, %ip2
216    jb .L_4X1_exit
217
218    /* If (ip3 < ip2) go to exit */
219    cmpq %ip2, %ip3
220    jb .L_4X1_exit
221
222/* Reads top 11 bits from bits[n]
223 * Loads dt[bits[n]] into var[n]
224 */
225#define GET_NEXT_DELT(n)                \
226    movq $53, %var##n;                  \
227    shrxq %var##n, %bits##n, %var##n;   \
228    movzwl (%dtable,%var##n,2),%vard##n
229
230/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
231 * Moves var[n] to %rax
232 * bits[n] <<= var[n] & 63
233 * op[n][idx] = %rax >> 8
234 * %ah is a way to access bits [8, 16) of %rax
235 */
236#define DECODE_FROM_DELT(n, idx)       \
237    movq %var##n, %rax;                \
238    shlxq %var##n, %bits##n, %bits##n; \
239    movb %ah, idx(%op##n)
240
241/* Assumes GET_NEXT_DELT has been called.
242 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
243 */
244#define DECODE_AND_GET_NEXT(n, idx) \
245    DECODE_FROM_DELT(n, idx);       \
246    GET_NEXT_DELT(n)                \
247
248/* // ctz & nbBytes is stored in bits[n]
249 * // nbBits is stored in %rax
250 * ctz  = CTZ[bits[n]]
251 * nbBits  = ctz & 7
252 * nbBytes = ctz >> 3
253 * op[n]  += 5
254 * ip[n]  -= nbBytes
255 * // Note: x86-64 is little-endian ==> no bswap
256 * bits[n] = MEM_readST(ip[n]) | 1
257 * bits[n] <<= nbBits
258 */
259#define RELOAD_BITS(n)             \
260    bsfq %bits##n, %bits##n;       \
261    movq %bits##n, %rax;           \
262    andq $7, %rax;                 \
263    shrq $3, %bits##n;             \
264    leaq 5(%op##n), %op##n;        \
265    subq %bits##n, %ip##n;         \
266    movq (%ip##n), %bits##n;       \
267    orq $1, %bits##n;              \
268    shlx %rax, %bits##n, %bits##n
269
270    /* Store clobbered variables on the stack */
271    movq %olimit, 24(%rsp)
272    movq %ip1, 0(%rsp)
273    movq %ip2, 8(%rsp)
274    movq %ip3, 16(%rsp)
275
276    /* Call GET_NEXT_DELT for each stream */
277    FOR_EACH_STREAM(GET_NEXT_DELT)
278
279    .p2align 6
280
281.L_4X1_loop_body:
282    /* Decode 5 symbols in each of the 4 streams (20 total)
283     * Must have called GET_NEXT_DELT for each stream
284     */
285    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
286    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
287    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
288    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
289    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
290
291    /* Load ip[1,2,3] from stack (var[] aliases them)
292     * ip[] is needed for RELOAD_BITS
293     * Each will be stored back to the stack after RELOAD
294     */
295    movq 0(%rsp), %ip1
296    movq 8(%rsp), %ip2
297    movq 16(%rsp), %ip3
298
299    /* Reload each stream & fetch the next table entry
300     * to prepare for the next iteration
301     */
302    RELOAD_BITS(0)
303    GET_NEXT_DELT(0)
304
305    RELOAD_BITS(1)
306    movq %ip1, 0(%rsp)
307    GET_NEXT_DELT(1)
308
309    RELOAD_BITS(2)
310    movq %ip2, 8(%rsp)
311    GET_NEXT_DELT(2)
312
313    RELOAD_BITS(3)
314    movq %ip3, 16(%rsp)
315    GET_NEXT_DELT(3)
316
317    /* If op3 < olimit: continue the loop */
318    cmp %op3, 24(%rsp)
319    ja .L_4X1_loop_body
320
321    /* Reload ip[1,2,3] from stack */
322    movq 0(%rsp), %ip1
323    movq 8(%rsp), %ip2
324    movq 16(%rsp), %ip3
325
326    /* Re-compute olimit */
327    jmp .L_4X1_compute_olimit
328
329#undef GET_NEXT_DELT
330#undef DECODE_FROM_DELT
331#undef DECODE
332#undef RELOAD_BITS
333.L_4X1_exit:
334    addq $24, %rsp
335
336    /* Restore stack (oend & olimit) */
337    pop %rax /* olimit */
338    pop %rax /* oend */
339    pop %rax /* ilowest */
340    pop %rax /* arg */
341
342    /* Save ip / op / bits */
343    movq %ip0,  0(%rax)
344    movq %ip1,  8(%rax)
345    movq %ip2, 16(%rax)
346    movq %ip3, 24(%rax)
347    movq %op0, 32(%rax)
348    movq %op1, 40(%rax)
349    movq %op2, 48(%rax)
350    movq %op3, 56(%rax)
351    movq %bits0, 64(%rax)
352    movq %bits1, 72(%rax)
353    movq %bits2, 80(%rax)
354    movq %bits3, 88(%rax)
355
356    /* Restore registers */
357    pop %r15
358    pop %r14
359    pop %r13
360    pop %r12
361    pop %r11
362    pop %r10
363    pop %r9
364    pop %r8
365    pop %rdi
366    pop %rsi
367    pop %rbp
368    pop %rdx
369    pop %rcx
370    pop %rbx
371    pop %rax
372    ret
373
374_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
375HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
376    ZSTD_CET_ENDBRANCH
377    /* Save all registers - even if they are callee saved for simplicity. */
378    push %rax
379    push %rbx
380    push %rcx
381    push %rdx
382    push %rbp
383    push %rsi
384    push %rdi
385    push %r8
386    push %r9
387    push %r10
388    push %r11
389    push %r12
390    push %r13
391    push %r14
392    push %r15
393
394    movq %rdi, %rax
395    movq  0(%rax), %ip0
396    movq  8(%rax), %ip1
397    movq 16(%rax), %ip2
398    movq 24(%rax), %ip3
399    movq 32(%rax), %op0
400    movq 40(%rax), %op1
401    movq 48(%rax), %op2
402    movq 56(%rax), %op3
403    movq 64(%rax), %bits0
404    movq 72(%rax), %bits1
405    movq 80(%rax), %bits2
406    movq 88(%rax), %bits3
407    movq 96(%rax), %dtable
408    push %rax      /* argument */
409    push %rax      /* olimit */
410    push 104(%rax) /* ilowest */
411
412    movq 112(%rax), %rax
413    push %rax /* oend3 */
414
415    movq %op3, %rax
416    push %rax /* oend2 */
417
418    movq %op2, %rax
419    push %rax /* oend1 */
420
421    movq %op1, %rax
422    push %rax /* oend0 */
423
424    /* Scratch space */
425    subq $8, %rsp
426
427.L_4X2_compute_olimit:
428    /* Computes how many iterations we can do safely
429     * %r15, %rax may be clobbered
430     * rdx must be saved
431     * op[1,2,3,4] & ip0 mustn't be clobbered
432     */
433    movq %rdx, 0(%rsp)
434
435    /* We can consume up to 7 input bytes each iteration. */
436    movq %ip0,     %rax  /* rax = ip0 */
437    movq 40(%rsp), %rdx  /* rdx = ilowest */
438    subq %rdx,     %rax  /* rax = ip0 - ilowest */
439    movq %rax,    %r15   /* r15 = ip0 - ilowest */
440
441    /* rdx = rax / 7 */
442    movabsq $2635249153387078803, %rdx
443    mulq %rdx
444    subq %rdx, %r15
445    shrq %r15
446    addq %r15, %rdx
447    shrq $2, %rdx
448
449    /* r15 = (ip0 - ilowest) / 7 */
450    movq %rdx, %r15
451
452    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
453    movq 8(%rsp),  %rax /* rax = oend0 */
454    subq %op0,     %rax /* rax = oend0 - op0 */
455    movq 16(%rsp), %rdx /* rdx = oend1 */
456    subq %op1,     %rdx /* rdx = oend1 - op1 */
457
458    cmpq  %rax,    %rdx
459    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
460
461    movq 24(%rsp), %rax /* rax = oend2 */
462    subq %op2,     %rax /* rax = oend2 - op2 */
463
464    cmpq  %rax,    %rdx
465    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
466
467    movq 32(%rsp), %rax /* rax = oend3 */
468    subq %op3,     %rax /* rax = oend3 - op3 */
469
470    cmpq  %rax,    %rdx
471    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
472
473    movabsq $-3689348814741910323, %rax
474    mulq %rdx
475    shrq $3,       %rdx /* rdx = rdx / 10 */
476
477    /* r15 = min(%rdx, %r15) */
478    cmpq  %rdx, %r15
479    cmova %rdx, %r15
480
481    /* olimit = op3 + 5 * r15 */
482    movq %r15, %rax
483    leaq (%op3, %rax, 4), %olimit
484    addq %rax, %olimit
485
486    movq 0(%rsp), %rdx
487
488    /* If (op3 + 10 > olimit) */
489    movq %op3, %rax    /* rax = op3 */
490    cmpq %rax, %olimit /* op3 == olimit */
491    je .L_4X2_exit
492
493    /* If (ip1 < ip0) go to exit */
494    cmpq %ip0, %ip1
495    jb .L_4X2_exit
496
497    /* If (ip2 < ip1) go to exit */
498    cmpq %ip1, %ip2
499    jb .L_4X2_exit
500
501    /* If (ip3 < ip2) go to exit */
502    cmpq %ip2, %ip3
503    jb .L_4X2_exit
504
505#define DECODE(n, idx)              \
506    movq %bits##n, %rax;            \
507    shrq $53, %rax;                 \
508    movzwl 0(%dtable,%rax,4),%r8d;  \
509    movzbl 2(%dtable,%rax,4),%r15d; \
510    movzbl 3(%dtable,%rax,4),%eax;  \
511    movw %r8w, (%op##n);            \
512    shlxq %r15, %bits##n, %bits##n; \
513    addq %rax, %op##n
514
515#define RELOAD_BITS(n)              \
516    bsfq %bits##n, %bits##n;        \
517    movq %bits##n, %rax;            \
518    shrq $3, %bits##n;              \
519    andq $7, %rax;                  \
520    subq %bits##n, %ip##n;          \
521    movq (%ip##n), %bits##n;        \
522    orq $1, %bits##n;               \
523    shlxq %rax, %bits##n, %bits##n
524
525
526    movq %olimit, 48(%rsp)
527
528    .p2align 6
529
530.L_4X2_loop_body:
531    /* We clobber r8, so store it on the stack */
532    movq %r8, 0(%rsp)
533
534    /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
535    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
536    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
537    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
538    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
539    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
540
541    /* Reload r8 */
542    movq 0(%rsp), %r8
543
544    FOR_EACH_STREAM(RELOAD_BITS)
545
546    cmp %op3, 48(%rsp)
547    ja .L_4X2_loop_body
548    jmp .L_4X2_compute_olimit
549
550#undef DECODE
551#undef RELOAD_BITS
552.L_4X2_exit:
553    addq $8, %rsp
554    /* Restore stack (oend & olimit) */
555    pop %rax /* oend0 */
556    pop %rax /* oend1 */
557    pop %rax /* oend2 */
558    pop %rax /* oend3 */
559    pop %rax /* ilowest */
560    pop %rax /* olimit */
561    pop %rax /* arg */
562
563    /* Save ip / op / bits */
564    movq %ip0,  0(%rax)
565    movq %ip1,  8(%rax)
566    movq %ip2, 16(%rax)
567    movq %ip3, 24(%rax)
568    movq %op0, 32(%rax)
569    movq %op1, 40(%rax)
570    movq %op2, 48(%rax)
571    movq %op3, 56(%rax)
572    movq %bits0, 64(%rax)
573    movq %bits1, 72(%rax)
574    movq %bits2, 80(%rax)
575    movq %bits3, 88(%rax)
576
577    /* Restore registers */
578    pop %r15
579    pop %r14
580    pop %r13
581    pop %r12
582    pop %r11
583    pop %r10
584    pop %r9
585    pop %r8
586    pop %rdi
587    pop %rsi
588    pop %rbp
589    pop %rdx
590    pop %rcx
591    pop %rbx
592    pop %rax
593    ret
594
595#endif
596