1/* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11#include "../common/portability_macros.h" 12 13#if defined(__ELF__) && defined(__GNUC__) 14/* Stack marking 15 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart 16 */ 17.section .note.GNU-stack,"",%progbits 18 19#if defined(__aarch64__) 20/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64. 21 * See: https://github.com/facebook/zstd/issues/3841 22 * See: https://gcc.godbolt.org/z/sqr5T4ffK 23 * See: https://lore.kernel.org/linux-arm-kernel/[email protected]/ 24 * See: https://reviews.llvm.org/D62609 25 */ 26.pushsection .note.gnu.property, "a" 27.p2align 3 28.long 4 /* size of the name - "GNU\0" */ 29.long 0x10 /* size of descriptor */ 30.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */ 31.asciz "GNU" 32.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */ 33.long 4 /* pr_datasz - 4 bytes */ 34.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ 35.p2align 3 /* pr_padding - bring everything to 8 byte alignment */ 36.popsection 37#endif 38 39#endif 40 41#if ZSTD_ENABLE_ASM_X86_64_BMI2 42 43/* Calling convention: 44 * 45 * %rdi contains the first argument: HUF_DecompressAsmArgs*. 46 * %rbp isn't maintained (no frame pointer). 47 * %rsp contains the stack pointer that grows down. 48 * No red-zone is assumed, only addresses >= %rsp are used. 49 * All register contents are preserved. 50 * 51 * TODO: Support Windows calling convention. 52 */ 53 54ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) 55ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) 56ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) 57ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) 58.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop 59.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop 60.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop 61.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop 62.text 63 64/* Sets up register mappings for clarity. 65 * op[], bits[], dtable & ip[0] each get their own register. 66 * ip[1,2,3] & olimit alias var[]. 67 * %rax is a scratch register. 68 */ 69 70#define op0 rsi 71#define op1 rbx 72#define op2 rcx 73#define op3 rdi 74 75#define ip0 r8 76#define ip1 r9 77#define ip2 r10 78#define ip3 r11 79 80#define bits0 rbp 81#define bits1 rdx 82#define bits2 r12 83#define bits3 r13 84#define dtable r14 85#define olimit r15 86 87/* var[] aliases ip[1,2,3] & olimit 88 * ip[1,2,3] are saved every iteration. 89 * olimit is only used in compute_olimit. 90 */ 91#define var0 r15 92#define var1 r9 93#define var2 r10 94#define var3 r11 95 96/* 32-bit var registers */ 97#define vard0 r15d 98#define vard1 r9d 99#define vard2 r10d 100#define vard3 r11d 101 102/* Calls X(N) for each stream 0, 1, 2, 3. */ 103#define FOR_EACH_STREAM(X) \ 104 X(0); \ 105 X(1); \ 106 X(2); \ 107 X(3) 108 109/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ 110#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ 111 X(0, idx); \ 112 X(1, idx); \ 113 X(2, idx); \ 114 X(3, idx) 115 116/* Define both _HUF_* & HUF_* symbols because MacOS 117 * C symbols are prefixed with '_' & Linux symbols aren't. 118 */ 119_HUF_decompress4X1_usingDTable_internal_fast_asm_loop: 120HUF_decompress4X1_usingDTable_internal_fast_asm_loop: 121 ZSTD_CET_ENDBRANCH 122 /* Save all registers - even if they are callee saved for simplicity. */ 123 push %rax 124 push %rbx 125 push %rcx 126 push %rdx 127 push %rbp 128 push %rsi 129 push %rdi 130 push %r8 131 push %r9 132 push %r10 133 push %r11 134 push %r12 135 push %r13 136 push %r14 137 push %r15 138 139 /* Read HUF_DecompressAsmArgs* args from %rax */ 140 movq %rdi, %rax 141 movq 0(%rax), %ip0 142 movq 8(%rax), %ip1 143 movq 16(%rax), %ip2 144 movq 24(%rax), %ip3 145 movq 32(%rax), %op0 146 movq 40(%rax), %op1 147 movq 48(%rax), %op2 148 movq 56(%rax), %op3 149 movq 64(%rax), %bits0 150 movq 72(%rax), %bits1 151 movq 80(%rax), %bits2 152 movq 88(%rax), %bits3 153 movq 96(%rax), %dtable 154 push %rax /* argument */ 155 push 104(%rax) /* ilowest */ 156 push 112(%rax) /* oend */ 157 push %olimit /* olimit space */ 158 159 subq $24, %rsp 160 161.L_4X1_compute_olimit: 162 /* Computes how many iterations we can do safely 163 * %r15, %rax may be clobbered 164 * rbx, rdx must be saved 165 * op3 & ip0 mustn't be clobbered 166 */ 167 movq %rbx, 0(%rsp) 168 movq %rdx, 8(%rsp) 169 170 movq 32(%rsp), %rax /* rax = oend */ 171 subq %op3, %rax /* rax = oend - op3 */ 172 173 /* r15 = (oend - op3) / 5 */ 174 movabsq $-3689348814741910323, %rdx 175 mulq %rdx 176 movq %rdx, %r15 177 shrq $2, %r15 178 179 movq %ip0, %rax /* rax = ip0 */ 180 movq 40(%rsp), %rdx /* rdx = ilowest */ 181 subq %rdx, %rax /* rax = ip0 - ilowest */ 182 movq %rax, %rbx /* rbx = ip0 - ilowest */ 183 184 /* rdx = (ip0 - ilowest) / 7 */ 185 movabsq $2635249153387078803, %rdx 186 mulq %rdx 187 subq %rdx, %rbx 188 shrq %rbx 189 addq %rbx, %rdx 190 shrq $2, %rdx 191 192 /* r15 = min(%rdx, %r15) */ 193 cmpq %rdx, %r15 194 cmova %rdx, %r15 195 196 /* r15 = r15 * 5 */ 197 leaq (%r15, %r15, 4), %r15 198 199 /* olimit = op3 + r15 */ 200 addq %op3, %olimit 201 202 movq 8(%rsp), %rdx 203 movq 0(%rsp), %rbx 204 205 /* If (op3 + 20 > olimit) */ 206 movq %op3, %rax /* rax = op3 */ 207 cmpq %rax, %olimit /* op3 == olimit */ 208 je .L_4X1_exit 209 210 /* If (ip1 < ip0) go to exit */ 211 cmpq %ip0, %ip1 212 jb .L_4X1_exit 213 214 /* If (ip2 < ip1) go to exit */ 215 cmpq %ip1, %ip2 216 jb .L_4X1_exit 217 218 /* If (ip3 < ip2) go to exit */ 219 cmpq %ip2, %ip3 220 jb .L_4X1_exit 221 222/* Reads top 11 bits from bits[n] 223 * Loads dt[bits[n]] into var[n] 224 */ 225#define GET_NEXT_DELT(n) \ 226 movq $53, %var##n; \ 227 shrxq %var##n, %bits##n, %var##n; \ 228 movzwl (%dtable,%var##n,2),%vard##n 229 230/* var[n] must contain the DTable entry computed with GET_NEXT_DELT 231 * Moves var[n] to %rax 232 * bits[n] <<= var[n] & 63 233 * op[n][idx] = %rax >> 8 234 * %ah is a way to access bits [8, 16) of %rax 235 */ 236#define DECODE_FROM_DELT(n, idx) \ 237 movq %var##n, %rax; \ 238 shlxq %var##n, %bits##n, %bits##n; \ 239 movb %ah, idx(%op##n) 240 241/* Assumes GET_NEXT_DELT has been called. 242 * Calls DECODE_FROM_DELT then GET_NEXT_DELT 243 */ 244#define DECODE_AND_GET_NEXT(n, idx) \ 245 DECODE_FROM_DELT(n, idx); \ 246 GET_NEXT_DELT(n) \ 247 248/* // ctz & nbBytes is stored in bits[n] 249 * // nbBits is stored in %rax 250 * ctz = CTZ[bits[n]] 251 * nbBits = ctz & 7 252 * nbBytes = ctz >> 3 253 * op[n] += 5 254 * ip[n] -= nbBytes 255 * // Note: x86-64 is little-endian ==> no bswap 256 * bits[n] = MEM_readST(ip[n]) | 1 257 * bits[n] <<= nbBits 258 */ 259#define RELOAD_BITS(n) \ 260 bsfq %bits##n, %bits##n; \ 261 movq %bits##n, %rax; \ 262 andq $7, %rax; \ 263 shrq $3, %bits##n; \ 264 leaq 5(%op##n), %op##n; \ 265 subq %bits##n, %ip##n; \ 266 movq (%ip##n), %bits##n; \ 267 orq $1, %bits##n; \ 268 shlx %rax, %bits##n, %bits##n 269 270 /* Store clobbered variables on the stack */ 271 movq %olimit, 24(%rsp) 272 movq %ip1, 0(%rsp) 273 movq %ip2, 8(%rsp) 274 movq %ip3, 16(%rsp) 275 276 /* Call GET_NEXT_DELT for each stream */ 277 FOR_EACH_STREAM(GET_NEXT_DELT) 278 279 .p2align 6 280 281.L_4X1_loop_body: 282 /* Decode 5 symbols in each of the 4 streams (20 total) 283 * Must have called GET_NEXT_DELT for each stream 284 */ 285 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) 286 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) 287 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) 288 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) 289 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) 290 291 /* Load ip[1,2,3] from stack (var[] aliases them) 292 * ip[] is needed for RELOAD_BITS 293 * Each will be stored back to the stack after RELOAD 294 */ 295 movq 0(%rsp), %ip1 296 movq 8(%rsp), %ip2 297 movq 16(%rsp), %ip3 298 299 /* Reload each stream & fetch the next table entry 300 * to prepare for the next iteration 301 */ 302 RELOAD_BITS(0) 303 GET_NEXT_DELT(0) 304 305 RELOAD_BITS(1) 306 movq %ip1, 0(%rsp) 307 GET_NEXT_DELT(1) 308 309 RELOAD_BITS(2) 310 movq %ip2, 8(%rsp) 311 GET_NEXT_DELT(2) 312 313 RELOAD_BITS(3) 314 movq %ip3, 16(%rsp) 315 GET_NEXT_DELT(3) 316 317 /* If op3 < olimit: continue the loop */ 318 cmp %op3, 24(%rsp) 319 ja .L_4X1_loop_body 320 321 /* Reload ip[1,2,3] from stack */ 322 movq 0(%rsp), %ip1 323 movq 8(%rsp), %ip2 324 movq 16(%rsp), %ip3 325 326 /* Re-compute olimit */ 327 jmp .L_4X1_compute_olimit 328 329#undef GET_NEXT_DELT 330#undef DECODE_FROM_DELT 331#undef DECODE 332#undef RELOAD_BITS 333.L_4X1_exit: 334 addq $24, %rsp 335 336 /* Restore stack (oend & olimit) */ 337 pop %rax /* olimit */ 338 pop %rax /* oend */ 339 pop %rax /* ilowest */ 340 pop %rax /* arg */ 341 342 /* Save ip / op / bits */ 343 movq %ip0, 0(%rax) 344 movq %ip1, 8(%rax) 345 movq %ip2, 16(%rax) 346 movq %ip3, 24(%rax) 347 movq %op0, 32(%rax) 348 movq %op1, 40(%rax) 349 movq %op2, 48(%rax) 350 movq %op3, 56(%rax) 351 movq %bits0, 64(%rax) 352 movq %bits1, 72(%rax) 353 movq %bits2, 80(%rax) 354 movq %bits3, 88(%rax) 355 356 /* Restore registers */ 357 pop %r15 358 pop %r14 359 pop %r13 360 pop %r12 361 pop %r11 362 pop %r10 363 pop %r9 364 pop %r8 365 pop %rdi 366 pop %rsi 367 pop %rbp 368 pop %rdx 369 pop %rcx 370 pop %rbx 371 pop %rax 372 ret 373 374_HUF_decompress4X2_usingDTable_internal_fast_asm_loop: 375HUF_decompress4X2_usingDTable_internal_fast_asm_loop: 376 ZSTD_CET_ENDBRANCH 377 /* Save all registers - even if they are callee saved for simplicity. */ 378 push %rax 379 push %rbx 380 push %rcx 381 push %rdx 382 push %rbp 383 push %rsi 384 push %rdi 385 push %r8 386 push %r9 387 push %r10 388 push %r11 389 push %r12 390 push %r13 391 push %r14 392 push %r15 393 394 movq %rdi, %rax 395 movq 0(%rax), %ip0 396 movq 8(%rax), %ip1 397 movq 16(%rax), %ip2 398 movq 24(%rax), %ip3 399 movq 32(%rax), %op0 400 movq 40(%rax), %op1 401 movq 48(%rax), %op2 402 movq 56(%rax), %op3 403 movq 64(%rax), %bits0 404 movq 72(%rax), %bits1 405 movq 80(%rax), %bits2 406 movq 88(%rax), %bits3 407 movq 96(%rax), %dtable 408 push %rax /* argument */ 409 push %rax /* olimit */ 410 push 104(%rax) /* ilowest */ 411 412 movq 112(%rax), %rax 413 push %rax /* oend3 */ 414 415 movq %op3, %rax 416 push %rax /* oend2 */ 417 418 movq %op2, %rax 419 push %rax /* oend1 */ 420 421 movq %op1, %rax 422 push %rax /* oend0 */ 423 424 /* Scratch space */ 425 subq $8, %rsp 426 427.L_4X2_compute_olimit: 428 /* Computes how many iterations we can do safely 429 * %r15, %rax may be clobbered 430 * rdx must be saved 431 * op[1,2,3,4] & ip0 mustn't be clobbered 432 */ 433 movq %rdx, 0(%rsp) 434 435 /* We can consume up to 7 input bytes each iteration. */ 436 movq %ip0, %rax /* rax = ip0 */ 437 movq 40(%rsp), %rdx /* rdx = ilowest */ 438 subq %rdx, %rax /* rax = ip0 - ilowest */ 439 movq %rax, %r15 /* r15 = ip0 - ilowest */ 440 441 /* rdx = rax / 7 */ 442 movabsq $2635249153387078803, %rdx 443 mulq %rdx 444 subq %rdx, %r15 445 shrq %r15 446 addq %r15, %rdx 447 shrq $2, %rdx 448 449 /* r15 = (ip0 - ilowest) / 7 */ 450 movq %rdx, %r15 451 452 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ 453 movq 8(%rsp), %rax /* rax = oend0 */ 454 subq %op0, %rax /* rax = oend0 - op0 */ 455 movq 16(%rsp), %rdx /* rdx = oend1 */ 456 subq %op1, %rdx /* rdx = oend1 - op1 */ 457 458 cmpq %rax, %rdx 459 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 460 461 movq 24(%rsp), %rax /* rax = oend2 */ 462 subq %op2, %rax /* rax = oend2 - op2 */ 463 464 cmpq %rax, %rdx 465 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 466 467 movq 32(%rsp), %rax /* rax = oend3 */ 468 subq %op3, %rax /* rax = oend3 - op3 */ 469 470 cmpq %rax, %rdx 471 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 472 473 movabsq $-3689348814741910323, %rax 474 mulq %rdx 475 shrq $3, %rdx /* rdx = rdx / 10 */ 476 477 /* r15 = min(%rdx, %r15) */ 478 cmpq %rdx, %r15 479 cmova %rdx, %r15 480 481 /* olimit = op3 + 5 * r15 */ 482 movq %r15, %rax 483 leaq (%op3, %rax, 4), %olimit 484 addq %rax, %olimit 485 486 movq 0(%rsp), %rdx 487 488 /* If (op3 + 10 > olimit) */ 489 movq %op3, %rax /* rax = op3 */ 490 cmpq %rax, %olimit /* op3 == olimit */ 491 je .L_4X2_exit 492 493 /* If (ip1 < ip0) go to exit */ 494 cmpq %ip0, %ip1 495 jb .L_4X2_exit 496 497 /* If (ip2 < ip1) go to exit */ 498 cmpq %ip1, %ip2 499 jb .L_4X2_exit 500 501 /* If (ip3 < ip2) go to exit */ 502 cmpq %ip2, %ip3 503 jb .L_4X2_exit 504 505#define DECODE(n, idx) \ 506 movq %bits##n, %rax; \ 507 shrq $53, %rax; \ 508 movzwl 0(%dtable,%rax,4),%r8d; \ 509 movzbl 2(%dtable,%rax,4),%r15d; \ 510 movzbl 3(%dtable,%rax,4),%eax; \ 511 movw %r8w, (%op##n); \ 512 shlxq %r15, %bits##n, %bits##n; \ 513 addq %rax, %op##n 514 515#define RELOAD_BITS(n) \ 516 bsfq %bits##n, %bits##n; \ 517 movq %bits##n, %rax; \ 518 shrq $3, %bits##n; \ 519 andq $7, %rax; \ 520 subq %bits##n, %ip##n; \ 521 movq (%ip##n), %bits##n; \ 522 orq $1, %bits##n; \ 523 shlxq %rax, %bits##n, %bits##n 524 525 526 movq %olimit, 48(%rsp) 527 528 .p2align 6 529 530.L_4X2_loop_body: 531 /* We clobber r8, so store it on the stack */ 532 movq %r8, 0(%rsp) 533 534 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ 535 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) 536 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) 537 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) 538 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) 539 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) 540 541 /* Reload r8 */ 542 movq 0(%rsp), %r8 543 544 FOR_EACH_STREAM(RELOAD_BITS) 545 546 cmp %op3, 48(%rsp) 547 ja .L_4X2_loop_body 548 jmp .L_4X2_compute_olimit 549 550#undef DECODE 551#undef RELOAD_BITS 552.L_4X2_exit: 553 addq $8, %rsp 554 /* Restore stack (oend & olimit) */ 555 pop %rax /* oend0 */ 556 pop %rax /* oend1 */ 557 pop %rax /* oend2 */ 558 pop %rax /* oend3 */ 559 pop %rax /* ilowest */ 560 pop %rax /* olimit */ 561 pop %rax /* arg */ 562 563 /* Save ip / op / bits */ 564 movq %ip0, 0(%rax) 565 movq %ip1, 8(%rax) 566 movq %ip2, 16(%rax) 567 movq %ip3, 24(%rax) 568 movq %op0, 32(%rax) 569 movq %op1, 40(%rax) 570 movq %op2, 48(%rax) 571 movq %op3, 56(%rax) 572 movq %bits0, 64(%rax) 573 movq %bits1, 72(%rax) 574 movq %bits2, 80(%rax) 575 movq %bits3, 88(%rax) 576 577 /* Restore registers */ 578 pop %r15 579 pop %r14 580 pop %r13 581 pop %r12 582 pop %r11 583 pop %r10 584 pop %r9 585 pop %r8 586 pop %rdi 587 pop %rsi 588 pop %rbp 589 pop %rdx 590 pop %rcx 591 pop %rbx 592 pop %rax 593 ret 594 595#endif 596