; XzCrc64Opt.asm -- CRC64 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain
;
; Table-driven ("slicing-by-N") CRC64 update routine for the XZ format.
; Exports one procedure, XzCrc64UpdateT<NUM_WORDS*4>, with the C signature
;   UInt64 XzCrc64UpdateT12(UInt64 crc, const void *data, size_t size,
;                           const UInt64 *table);
; (the numeric suffix is NUM_WORDS * 4 = bytes consumed per unrolled step).
; `table` is an array of NUM_WORDS*4 sub-tables of 256 QWORD entries each
; (0800h bytes per sub-table, see the "0800h * (t)" addressing below).
; Register names (r0/x0, rD, rN, rT, REG_ABI_PARAM_*) are the abstract names
; defined by 7zAsm.asm, which maps them per target ABI (Win64 / SysV / x86).

include 7zAsm.asm

MY_ASM_START

; Number of 32-bit words processed per main-loop iteration (slicing width).
; NUM_WORDS = 3 -> 12 bytes per step -> procedure name XzCrc64UpdateT12.
NUM_WORDS equ 3

if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <num_words_IS_INCORRECT>
endif

; Bytes handled purely from the lookup tables (beyond the first 8 that are
; XORed into the running CRC): (NUM_WORDS - 2) * 4.
NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4)


; MOVZXLO / MOVZXHI: zero-extend the low / second byte of a 32-bit register
; into dest, using 7zAsm.asm's _L / _H byte-register name suffixes.
MOVZXLO macro dest:req, src:req
    movzx dest, @CatStr(src, _L)
endm

MOVZXHI macro dest:req, src:req
    movzx dest, @CatStr(src, _H)
endm


ifdef x64
; ==================================================================
; x64 (64-bit) version: the 64-bit CRC lives in one register (r0).

rD equ r11              ; data pointer (becomes a negative loop counter, see below)
rN equ r10              ; size (becomes end-of-safe-region pointer, see below)
rT equ r9               ; table base pointer

; CRC_OP: dest <op>= table[t][src], where each sub-table t is 256 QWORDs
; (0800h bytes) and src is a zero-extended byte index.
CRC_OP macro op:req, dest:req, src:req, t:req
    op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
endm

CRC_XOR macro dest:req, src:req, t:req
    CRC_OP xor, dest, src, t
endm

CRC_MOV macro dest:req, src:req, t:req
    CRC_OP mov, dest, src, t
endm

; CRC1b: classic one-byte-at-a-time CRC step:
;   crc = table[0][(crc ^ *data++) & 0xFF] ^ (crc >> 8);  size--
; Used for alignment prologue and for the final tail bytes.
; Clobbers x3, x6; updates r0 (crc), rD, rN.
CRC1b macro
    movzx x6, BYTE PTR [rD]
    inc rD
    MOVZXLO x3, x0
    xor x6, x3
    shr r0, 8
    CRC_XOR r0, x6, 0
    dec rN
endm


; ALIGN_MASK is 3 or 7 bytes alignment:
; 7 when NUM_WORDS is even (8-byte-aligned main loop), 3 when odd.
ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)

if NUM_WORDS eq 1

src_rN_offset equ 4
; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4)
SRCDAT4 equ DWORD PTR [rN + rD * 1]

; XOR_NEXT: fold the next 32 bits of input into the low half of the CRC.
XOR_NEXT macro
    mov x1, [rD]
    xor r0, r1
endm

else ; NUM_WORDS > 1

src_rN_offset equ 8
; + 8 for prefetching next 8-bytes after current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)

; XOR_NEXT: fold the next 64 bits of input into the CRC.
XOR_NEXT macro
    xor r0, QWORD PTR [rD]  ; 64-bit read, can be unaligned
endm

; 32-bit or 64-bit load of source word (word_index) of the current step.
; Inside the main loop [rN + rD] addresses the current data position
; (rD holds data - rN; see the pointer setup in MY_PROC below).
LOAD_SRC_MULT4 macro dest:req, word_index:req
    mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
endm

endif



MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
    MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

    ; Load args into the fixed working registers (comments show Win64 / SysV).
    mov r0, REG_ABI_PARAM_0   ; r0 <- r1 / r7   : crc
    mov rD, REG_ABI_PARAM_1   ; r11 <- r2 / r6  : data
    mov rN, REG_ABI_PARAM_2   ; r10 <- r8 / r2  : size
if (IS_LINUX gt 0)
    mov rT, REG_ABI_PARAM_3   ; r9 <- r9 / r1   : table (Win64: already in r9)
endif

    ; Too small for the unrolled loop (incl. worst-case alignment bytes)?
    cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
    jb crc_end
@@:
    ; Byte-step until rD is (ALIGN_MASK+1)-aligned.
    test rD, ALIGN_MASK
    jz @F
    CRC1b
    jmp @B
@@:
    XOR_NEXT
    ; Pointer trick: rN <- data + size - (NUM_BYTES_LIMIT - 1), the end of
    ; the region safe for full steps; rD <- data - rN, a NEGATIVE offset.
    ; The loop then runs while rD stays negative (no carry from the add).
    lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
    sub rD, rN
    add rN, src_rN_offset

align 16
@@:

if NUM_WORDS eq 1

    ; Consume 4 bytes: split the (already input-XORed) low 32 bits of the
    ; CRC into 4 byte indices, shift the CRC right 32, fold 4 table entries.
    mov x1, x0
    shr x1, 8
    MOVZXLO x3, x1
    MOVZXLO x2, x0
    shr x1, 8
    shr r0, 32
    xor x0, SRCDAT4       ; prefetch/fold next 4 input bytes
    CRC_XOR r0, x2, 3
    CRC_XOR r0, x3, 2
    MOVZXLO x2, x1
    shr x1, 8
    CRC_XOR r0, x2, 1
    CRC_XOR r0, x1, 0

else ; NUM_WORDS > 1

if NUM_WORDS ne 2
    ; Middle words (indices 2..NUM_WORDS-1): each byte is looked up in its
    ; own sub-table; results accumulate into r6/r7. On the first middle word
    ; (k == 2) the accumulators are INITIALIZED (mov / first load) instead of
    ; XORed, and the next step's input words are prefetched.
    k = 2
    while k lt NUM_WORDS

        LOAD_SRC_MULT4 x1, k
        crc_op1 textequ <xor>

        if k eq 2
          if (NUM_WORDS and 1)
            ; odd NUM_WORDS: next input is only 4-byte aligned -> two 32-bit loads
            LOAD_SRC_MULT4 x7, NUM_WORDS        ; aligned 32-bit
            LOAD_SRC_MULT4 x6, NUM_WORDS + 1    ; aligned 32-bit
            shl r6, 32
          else
            LOAD_SRC_MULT4 r6, NUM_WORDS        ; aligned 64-bit
            crc_op1 textequ <mov>               ; first write initializes r7
          endif
        endif
        table = 4 * (NUM_WORDS - 1 - k)
        MOVZXLO x3, x1
        CRC_OP crc_op1, r7, x3, 3 + table
        MOVZXHI x3, x1
        shr x1, 16
        CRC_XOR r6, x3, 2 + table
        MOVZXLO x3, x1
        shr x1, 8
        CRC_XOR r7, x3, 1 + table
        CRC_XOR r6, x1, 0 + table
        k = k + 1
    endm
    crc_op2 textequ <xor>

else ; NUM_WORDS == 2
    ; No middle words: just prefetch the next 8 input bytes; the first write
    ; to r7 below initializes it (mov).
    LOAD_SRC_MULT4 r6, NUM_WORDS  ; aligned 64-bit
    crc_op2 textequ <mov>
endif ; NUM_WORDS == 2

    ; First 8 bytes (the ones folded into the CRC itself): split the 64-bit
    ; CRC r0 into 8 byte indices and fold 8 table lookups into r6/r7.
    MOVZXHI x3, x0
    MOVZXLO x2, x0
    mov r1, r0
    shr r1, 32
    shr x0, 16
    CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
    CRC_OP crc_op2, r7, x3, NUM_SKIP_BYTES + 6
    MOVZXLO x2, x0
    MOVZXHI x5, x1
    MOVZXLO x3, x1
    shr x0, 8
    shr x1, 16
    CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
    CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
    CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
    CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
    MOVZXLO x2, x1
    shr x1, 8
    CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
    CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
    xor r0, r6
    xor r0, r7

endif ; NUM_WORDS > 1
    ; rD is negative; carry appears once it crosses zero -> loop exit.
    add rD, NUM_WORDS * 4
    jnc @B

    ; Restore a real pointer/count: rD <- current data position,
    ; rN <- number of tail bytes still to process (after the 4/8 folded here).
    sub rN, src_rN_offset
    add rD, rN
    XOR_NEXT
    add rN, NUM_BYTES_LIMIT - 1
    sub rN, rD

crc_end:
    ; Byte-at-a-time tail loop (also the whole path for tiny inputs).
    test rN, rN
    jz func_end
@@:
    CRC1b
    jnz @B
func_end:
    MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_ENDP



else
; ==================================================================
; x86 (32-bit): the 64-bit CRC is kept as a register pair x0 (low) : x2
; (high); every table lookup is two 32-bit loads (CRC_1 word_index 0/1).

rD equ r7               ; data pointer
rN equ r1               ; size / loop bound
rT equ r5               ; table base pointer

xA equ x6               ; scratch byte-index register
xA_R equ r6

ifdef x64
num_VAR equ r8
else

; Stack offset of the crc argument: above the 4 pushed regs + return address.
crc_OFFS equ (REG_SIZE * 5)

if (IS_CDECL gt 0) or (IS_LINUX gt 0)
  ; cdecl or (GNU fastcall) stack:
  ;   (UInt32 *) table
  ;   size_t     size
  ;   void *     data
  ;   (UInt64)   crc
  ;   ret-ip <-(r4)
  data_OFFS  equ (8 + crc_OFFS)
  size_OFFS  equ (REG_SIZE + data_OFFS)
  table_OFFS equ (REG_SIZE + size_OFFS)
  num_VAR    equ [r4 + size_OFFS]
  table_VAR  equ [r4 + table_OFFS]
else
  ; Windows fastcall:
  ;   r1 = data, r2 = size
  ; stack:
  ;   (UInt32 *) table
  ;   (UInt64)   crc
  ;   ret-ip <-(r4)
  table_OFFS equ (8 + crc_OFFS)
  table_VAR  equ [r4 + table_OFFS]
  ; table slot is reused as the loop-bound spill (table is loaded into rT first)
  num_VAR    equ table_VAR
endif
endif ; x64

SRCDAT4 equ DWORD PTR [rN + rD * 1]

; CRC_1: dest <op>= low/high dword (word_index 0/1) of table[t][src];
; sub-tables are 256 QWORD entries = 0800h bytes apart.
CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
    op dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
endm

; CRC: apply a 64-bit table entry to the pair (dest0, dest1) with separate
; ops per half (e.g. "xor, mov" to initialize the high accumulator).
CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
    CRC_1 op0, dest0, src, t, 0
    CRC_1 op1, dest1, src, t, 1
endm

CRC_XOR macro dest0:req, dest1:req, src:req, t:req
    CRC xor, xor, dest0, dest1, src, t
endm


; CRC1b: one-byte CRC step on the x0:x2 pair; the 64-bit ">> 8" is done with
; shrd (low half gets bits from the high half) + shr (high half).
; Clobbers x3, xA; updates x0:x2 (crc), rD, rN.
CRC1b macro
    movzx xA, BYTE PTR [rD]
    inc rD
    MOVZXLO x3, x0
    xor xA, x3
    shrd x0, x2, 8
    shr x2, 8
    CRC_XOR x0, x2, xA, 0
    dec rN
endm


; Shared prologue: push regs, load crc into x0:x2 and data/size/table into
; rD/rN/rT for whichever ABI is being assembled.
MY_PROLOG_BASE macro
    MY_PUSH_4_REGS
ifdef x64
    mov r0, REG_ABI_PARAM_0   ; r0 <- r1 / r7 : crc
    mov rT, REG_ABI_PARAM_3   ; r5 <- r9 / r1 : table
    mov rN, REG_ABI_PARAM_2   ; r1 <- r8 / r2 : size
    mov rD, REG_ABI_PARAM_1   ; r7 <- r2 / r6 : data
    ; split 64-bit crc into the x0 (low) : x2 (high) pair
    mov r2, r0
    shr r2, 32
    mov x0, x0                ; zero-extend: clear upper 32 bits of r0
else
  if (IS_CDECL gt 0) or (IS_LINUX gt 0)
    proc_numParams = proc_numParams + 2  ; for ABI_LINUX
    mov rN, [r4 + size_OFFS]
    mov rD, [r4 + data_OFFS]
  else
    mov rD, REG_ABI_PARAM_0   ; r7 <- r1 : (data)
    mov rN, REG_ABI_PARAM_1   ; r1 <- r2 : (size)
  endif
    mov x0, [r4 + crc_OFFS]
    mov x2, [r4 + crc_OFFS + 4]
    mov rT, table_VAR
endif
endm


; Shared epilogue: byte-at-a-time tail loop, then reassemble the 64-bit
; result (x64 build returns it in r0) and pop regs. Label names are passed
; in so each instantiation gets unique labels.
MY_EPILOG_BASE macro crc_end:req, func_end:req
crc_end:
    test rN, rN
    jz func_end
@@:
    CRC1b
    jnz @B
func_end:
ifdef x64
    shl r2, 32
    xor r0, r2
endif
    MY_POP_4_REGS
endm


; ALIGN_MASK is 3 or 7 bytes alignment:
ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)

if (NUM_WORDS eq 1)
; ------------------------------------------------------------------
; T4 variant: 4 bytes per iteration.

NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)

MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
    MY_PROLOG_BASE

    cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
    jb crc_end_4
@@:
    ; align rD (4-byte) with byte steps
    test rD, ALIGN_MASK
    jz @F
    CRC1b
    jmp @B
@@:
    xor x0, [rD]
    ; same negative-offset loop setup as the x64 version:
    ; rN <- end-of-safe-region, rD <- data - rN (negative)
    lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
    sub rD, rN
    add rN, 4

    MOVZXLO xA, x0
align 16
@@:
    ; crc64 = table-fold of 4 bytes; x3:x2 accumulate, then become x0:x2.
    mov x3, SRCDAT4
    xor x3, x2
    shr x0, 8
    CRC xor, mov, x3, x2, xA, 3   ; mov initializes the new high half
    MOVZXLO xA, x0
    shr x0, 8
    ; MOVZXHI xA, x0
    ; shr x0, 16
    CRC_XOR x3, x2, xA, 2

    MOVZXLO xA, x0
    shr x0, 8
    CRC_XOR x3, x2, xA, 1
    CRC_XOR x3, x2, x0, 0
    MOVZXLO xA, x3
    mov x0, x3

    ; rD is negative; carry on crossing zero ends the loop
    add rD, 4
    jnc @B

    ; restore pointer/count, fold the prefetched dword, run the tail loop
    sub rN, 4
    add rD, rN
    xor x0, [rD]
    add rN, NUM_BYTES_LIMIT_T4 - 1
    sub rN, rD
    MY_EPILOG_BASE crc_end_4, func_end_4
MY_ENDP

else ; NUM_WORDS > 1
; ------------------------------------------------------------------
; T8/T12/... variant: NUM_WORDS * 4 bytes per (unrolled) iteration.

SHR_X macro x, imm
    shr x, imm
endm


; ITER_1: fold one byte of a (low byte) through sub-table off, shifting a.
ITER_1 macro v0, v1, a, off
    MOVZXLO xA, a
    SHR_X a, 8
    CRC_XOR v0, v1, xA, off
endm


; ITER_4: fold all 4 bytes of register a through sub-tables off..off+3.
; Three equivalent code sequences are kept; "if 0 eq 0" selects the first
; (the others are alternatives with different size/speed trade-offs).
ITER_4 macro v0, v1, a, off
if 0 eq 0
    ITER_1 v0, v1, a, off + 3
    ITER_1 v0, v1, a, off + 2
    ITER_1 v0, v1, a, off + 1
    CRC_XOR v0, v1, a, off
elseif 0 eq 0
    MOVZXLO xA, a
    CRC_XOR v0, v1, xA, off + 3
    mov xA, a
    ror a, 16   ; 32-bit ror
    shr xA, 24
    CRC_XOR v0, v1, xA, off
    MOVZXLO xA, a
    SHR_X a, 24
    CRC_XOR v0, v1, xA, off + 1
    CRC_XOR v0, v1, a, off + 2
else
    ; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction
    MOVZXLO xA, a
    CRC_XOR v0, v1, xA, off + 3
    MOVZXHI xA, a
    SHR_X a, 16
    CRC_XOR v0, v1, xA, off + 2
    MOVZXLO xA, a
    SHR_X a, 8
    CRC_XOR v0, v1, xA, off + 1
    CRC_XOR v0, v1, a, off
endif
endm



; ITER_1_PAIR: one byte from each of two source words (interleaved variant).
ITER_1_PAIR macro v0, v1, a0, a1, off
    ITER_1 v0, v1, a0, off + 4
    ITER_1 v0, v1, a1, off
endm

src_rD_offset equ 8
STEP_SIZE equ (NUM_WORDS * 4)

; ITER_12_NEXT: load/fold the first 8 bytes of the NEXT step into v0:v1.
ITER_12_NEXT macro op, index, v0, v1
    op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset]
    op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
endm

; ITER_12: one full STEP_SIZE-byte step. Inputs: current CRC pair (a0, a1);
; outputs: next CRC pair (v0, v1). First the NUM_SKIP_BYTES middle bytes are
; looked up (first lookup initializes v0:v1 with mov), then the next step's
; 8 bytes are folded in, then the 8 CRC bytes are folded via ITER_4.
ITER_12 macro index, a0, a1, v0, v1

  if NUM_SKIP_BYTES eq 0
    ITER_12_NEXT mov, index, v0, v1
  else
    k = 0
    while k lt NUM_SKIP_BYTES
        movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
      if k eq 0
        CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k
      else
        CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k
      endif
        k = k + 1
    endm
    ITER_12_NEXT xor, index, v0, v1
  endif

if 0 eq 0
    ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4
    ITER_4 v0, v1, a1, NUM_SKIP_BYTES
else ; interleave version is faster/slower for different processors
    ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
    ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
    ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
    CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4
    CRC_XOR v0, v1, a1, NUM_SKIP_BYTES
endif
endm

; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads)
UNROLL_CNT equ (2 * 1)
NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8)

MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
    MY_PROLOG_BASE

    cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
    jb crc_end_12
@@:
    ; align rD with byte steps
    test rD, ALIGN_MASK
    jz @F
    CRC1b
    jmp @B
@@:
    ; fold the first 8 input bytes into the CRC pair, then spill the loop
    ; bound (end-of-safe-region pointer) to num_VAR
    xor x0, [rD]
    xor x2, [rD + 4]
    add rD, src_rD_offset
    lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
    mov num_VAR, rN

align 16
@@:
    ; Unrolled main loop: the CRC pair ping-pongs between (x0, x2) and
    ; (x1, x3) on alternate steps to avoid extra moves.
    i = 0
    rept UNROLL_CNT
      if (i and 1) eq 0
        ITER_12 i, x0, x2, x1, x3
      else
        ITER_12 i, x1, x3, x0, x2
      endif
        i = i + 1
    endm

  if (UNROLL_CNT and 1)
    ; odd unroll count leaves the result in x1:x3 -> move it back
    mov x0, x1
    mov x2, x3
  endif
    add rD, STEP_SIZE * UNROLL_CNT
    cmp rD, num_VAR
    jb @B

    ; restore tail count, undo the prefetch offset, fold the prefetched
    ; 8 bytes back out, then run the byte-wise tail loop
    mov rN, num_VAR
    add rN, NUM_BYTES_LIMIT - 1
    sub rN, rD
    sub rD, src_rD_offset
    xor x0, [rD]
    xor x2, [rD + 4]

    MY_EPILOG_BASE crc_end_12, func_end_12
MY_ENDP

endif ; (NUM_WORDS > 1)
endif ; ! x64
end