; 7zCrcOpt.asm -- CRC32 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain
;
; Syntax: MASM-style macro assembler (equ / macro / endm / @CatStr).
; The numbered register aliases (r0.., x0..), their _L / _H / _R variants,
; REG_SIZE, IS_CDECL, IS_LINUX, REG_ABI_PARAM_* and the MY_* macros are not
; defined in this file; they are expected to come from 7zAsm.asm.

include 7zAsm.asm

MY_ASM_START

; NUM_WORDS  : number of 32-bit data words folded into the CRC by one CRC_ITER.
;              NUM_WORDS * 4 is also the numeric suffix of the procedure name.
; UNROLL_CNT : number of CRC_ITER expansions in one pass of the main loop.
NUM_WORDS       equ     3
UNROLL_CNT      equ     2

if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <NUM_WORDS_IS_INCORRECT>
endif
if (UNROLL_CNT lt 1)
.err <UNROLL_CNT_IS_INCORRECT>
endif

; Register roles used throughout this file:
;   rD   : data pointer (inside the main loop: offset relative to rN)
;   rD_x : 32-bit name of rD, used for the alignment test
;   rN   : remaining size in bytes (inside the main loop: base address)
;   rT   : base address of the CRC lookup tables
rD              equ     r2
rD_x            equ     x2
rN              equ     r7
rT              equ     r5

; 32-bit builds only: offsets of the stack-passed arguments relative to r4.
; NOTE(review): the (REG_SIZE * 5) base presumably covers the return address
; plus the registers pushed by the prologue macro - confirm in 7zAsm.asm.
ifndef x64
    if (IS_CDECL gt 0)
        crc_OFFS        equ     (REG_SIZE * 5)
        data_OFFS       equ     (REG_SIZE + crc_OFFS)
        size_OFFS       equ     (REG_SIZE + data_OFFS)
    else
        size_OFFS       equ     (REG_SIZE * 5)
    endif
        table_OFFS      equ     (REG_SIZE + size_OFFS)
endif

; rN + rD is same speed as rD, but we reduce one instruction in loop
;
; SRCDAT_1 / SRCDAT_4 are deliberately incomplete address expressions that
; end with a scale factor. The user appends the offset, for example
; [SRCDAT_4 (k)], so the offset is counted in bytes (SRCDAT_1) or in
; 32-bit words (SRCDAT_4).
SRCDAT_1        equ     rN + rD * 1 + 1 *
SRCDAT_4        equ     rN + rD * 1 + 4 *

; CRC op, dest, src, t
;   Emits:  op dest, table_t[src]
;   table_t starts at rT + 0400h * t and holds dword entries; src is used
;   as the entry index through its _R register name.
CRC macro op:req, dest:req, src:req, t:req
        op      dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
endm

; CRC_XOR dest, src, t :  dest ^= table_t[src]
CRC_XOR macro dest:req, src:req, t:req
        CRC     xor, dest, src, t
endm

; CRC_MOV dest, src, t :  dest = table_t[src]
CRC_MOV macro dest:req, src:req, t:req
        CRC     mov, dest, src, t
endm

; MOVZXLO dest, src :  dest = zero-extended src_L (low byte name of src)
MOVZXLO macro dest:req, src:req
        movzx   dest, @CatStr(src, _L)
endm

; MOVZXHI dest, src :  dest = zero-extended src_H (high byte name of src)
MOVZXHI macro dest:req, src:req
        movzx   dest, @CatStr(src, _H)
endm

; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
; movzx x3, x0_L sometimes is 0   cycles latency (not always)
; movzx x3, x0_L sometimes is 0.5 cycles latency
; movzx x3, x0_H is 2 cycles latency in some cpus

; CRC1b : consume one input byte with the single-table update
;   x0 = (x0 >> 8) ^ table_0[ low_byte(x0) ^ byte at rD ]
;   rD is advanced by 1 and rN is decremented by 1.
; Clobbers x3 and x6. "dec rN" is the last instruction, so the flags on
; exit reflect the new value of rN (the tail loop branches on ZF).
CRC1b macro
        movzx   x6, byte ptr [rD]
        MOVZXLO x3, x0
        inc     rD
        shr     x0, 8
        xor     x6, x3
        CRC_XOR x0, x6, 0
        dec     rN
endm

; LOAD_1 / LOAD_2 : zero-extending byte / word load from the data word that
; belongs to table group t of unrolled iteration iter. index is the byte
; offset inside that 4-byte word.
LOAD_1 macro dest:req, t:req, iter:req, index:req
        movzx   dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

LOAD_2 macro dest:req, t:req, iter:req, index:req
        movzx   dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm

; CRC_QUAD nn, t, iter
;   Folds the four bytes of one data word into nn:
;     byte 0 goes through table (t * 4 + 3), byte 1 through (t * 4 + 2),
;     byte 2 through (t * 4 + 1) and byte 3 through (t * 4 + 0).
;   Clobbers x3 and x6; the x64 variant also clobbers x9.
;   Only one of the variants below is assembled, the others are kept for
;   reference.
CRC_QUAD macro nn, t:req, iter:req
ifdef x64
        ; paired memory loads give 1-3% speed gain, but it uses more registers
        LOAD_2  x3, t, iter, 0
        LOAD_2  x9, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXLO x6, x9
        shr     x9, 8
        CRC_XOR nn, x3, t * 4 + 2
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x9, t * 4 + 0
elseif 0
        LOAD_2  x3, t, iter, 0
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 3
        CRC_XOR nn, x3, t * 4 + 2
        LOAD_2  x3, t, iter, 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
elseif 0
        LOAD_1  x3, t, iter, 0
        LOAD_1  x6, t, iter, 1
        CRC_XOR nn, x3, t * 4 + 3
        CRC_XOR nn, x6, t * 4 + 2
        LOAD_1  x3, t, iter, 2
        LOAD_1  x6, t, iter, 3
        CRC_XOR nn, x3, t * 4 + 1
        CRC_XOR nn, x6, t * 4 + 0
else
        ; 32-bit load is better if there is only one read port (core2)
        ; but that code can be slower if there are 2 read ports (snb)
        mov     x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
        MOVZXLO x6, x3
        CRC_XOR nn, x6, t * 4 + 3
        MOVZXHI x6, x3
        shr     x3, 16
        CRC_XOR nn, x6, t * 4 + 2
        MOVZXLO x6, x3
        shr     x3, 8
        CRC_XOR nn, x6, t * 4 + 1
        CRC_XOR nn, x3, t * 4 + 0
endif
endm


; Index of the first of the four tables used for the word that is already
; xored into the running CRC register (the oldest word of an iteration).
LAST            equ     (4 * (NUM_WORDS - 1))

; CRC_ITER qq, nn, iter
;   One unrolled step over NUM_WORDS data words.
;   On entry qq holds the running CRC already xored with the first data
;   word of this step.
;   1. nn is loaded with the data word that follows this step's words.
;   2. The remaining NUM_WORDS - 1 words of this step are folded into nn
;      with CRC_QUAD.
;   3. The four bytes of qq are folded in through tables LAST + 3 .. LAST + 0.
;   The result is left in nn. The only exception is the last step of an odd
;   UNROLL_CNT, where it is left in qq, so that after a full pass the value
;   is always in the register that the next pass reads as qq.
;   Clobbers x3, x6 (and whatever CRC_QUAD clobbers); qq is destroyed.
CRC_ITER macro qq, nn, iter
        mov     nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]

    i = 0
    rept NUM_WORDS - 1
        CRC_QUAD nn, i, iter
        i = i + 1
    endm

        MOVZXLO x6, qq
        mov     x3, qq
        shr     x3, 24
        CRC_XOR nn, x6, LAST + 3
        CRC_XOR nn, x3, LAST + 0
        ror     qq, 16
        MOVZXLO x6, qq
        shr     qq, 24
        CRC_XOR nn, x6, LAST + 1
if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
        CRC_MOV qq, qq, LAST + 2
        xor     qq, nn
else
        CRC_XOR nn, qq, LAST + 2
endif
endm


; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ     (NUM_WORDS * 4 * UNROLL_CNT + 4)
; The main loop starts only after rD is a multiple of (ALIGN_MASK + 1).
ALIGN_MASK      equ     3


;-----------------------------------------------------------------------
; CrcUpdateT<NUM_WORDS * 4>  (CrcUpdateT12 with the current NUM_WORDS)
;
; Arguments, in order (names follow the *_OFFS equates and the register
; roles): crc, data, size, table.
;   x0 = crc    running CRC value
;   rD = data   pointer to the input bytes
;   rN = size   number of input bytes
;   rT = table  base of the lookup tables; table t is at rT + 0400h * t
; Result: the updated CRC is left in x0.
;   NOTE(review): presumably x0 is the ABI return register - confirm in
;   7zAsm.asm.
;
; Phases:
;   1. size below NUM_BYTES_LIMIT + ALIGN_MASK : byte loop only.
;   2. byte steps until rD is 4-byte aligned.
;   3. main loop : NUM_WORDS * UNROLL_CNT words per pass.
;   4. word loop : remaining whole words, one per step.
;   5. byte loop : remaining bytes.
; Scratch registers: x1, x3, x6 (and x9 on x64).
;-----------------------------------------------------------------------
; MY_PROC @CatStr(CrcUpdateT, 12), 4
MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
    ifdef x64
        ; The order of these moves matters: each source register is read
        ; before a later move overwrites it (see the register notes).
        mov     x0, REG_ABI_PARAM_0_x   ; x0 = x1(win) / x7(linux)
        mov     rT, REG_ABI_PARAM_3     ; r5 = r9(win) / r1(linux)
        mov     rN, REG_ABI_PARAM_2     ; r7 = r8(win) / r2(linux)
        ; mov   rD, REG_ABI_PARAM_1     ; r2 = r2(win)
      if (IS_LINUX gt 0)
        mov     rD, REG_ABI_PARAM_1     ; r2 = r6
      endif
    else
      if (IS_CDECL gt 0)
        mov     x0, [r4 + crc_OFFS]
        mov     rD, [r4 + data_OFFS]
      else
        ; NOTE(review): rD is not loaded on this path; presumably the data
        ; pointer already arrives in r2 - confirm against the caller ABI.
        mov     x0, REG_ABI_PARAM_0_x
      endif
        mov     rN, [r4 + size_OFFS]
        mov     rT, [r4 + table_OFFS]
    endif

        ; Too short for alignment bytes plus one full pass plus the
        ; prefetched word: handle everything in the byte loop.
        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
        jb      crc_end
@@:
        ; Byte steps until the data pointer is aligned.
        test    rD_x, ALIGN_MASK        ; test rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        ; Pre-xor the first data word into the CRC, then switch to
        ; base + offset addressing:
        ;   rN = end of data - (NUM_BYTES_LIMIT - 1)
        ;   rD = current position - rN   (negative, counts up towards zero)
        ; so that [rN + rD] is the current position.
        xor     x0, dword ptr [rD]
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
        sub     rD, rN

align 16
@@:
unr_index = 0
while unr_index lt UNROLL_CNT
    if (unr_index and 1) eq 0
        CRC_ITER x0, x1, unr_index
    else
        CRC_ITER x1, x0, unr_index
    endif
        unr_index = unr_index + 1
endm

        ; Advance by one pass; a carry out of the add means the offset has
        ; crossed zero and there is no room for another full pass.
        add     rD, NUM_WORDS * 4 * UNROLL_CNT
        jnc     @B

if 0
        ; byte version
        add     rD, rN
        xor     x0, dword ptr [rD]
        add     rN, NUM_BYTES_LIMIT - 1
else
        ; 4-byte version
        ; Move the base forward by one pass and the offset back by the same
        ; amount: [rN + rD] is unchanged and rN becomes (end of data - 3).
        add     rN, 4 * NUM_WORDS * UNROLL_CNT
        sub     rD, 4 * NUM_WORDS * UNROLL_CNT
@@:
        ; x0 is already xored with the current data word: fold its four
        ; bytes through tables 3, 2, 1, 0.
        MOVZXLO x3, x0
        MOVZXHI x1, x0
        shr     x0, 16
        MOVZXLO x6, x0
        shr     x0, 8
        CRC_MOV x0, x0, 0
        CRC_XOR x0, x3, 3
        CRC_XOR x0, x1, 2
        CRC_XOR x0, x6, 1

        add     rD, 4
if (NUM_WORDS * UNROLL_CNT) ne 1
        ; With a one-word pass the add above always carries, so the
        ; loop-back is only assembled for larger passes.
        jc      @F
        xor     x0, [SRCDAT_4 0]
        jmp     @B
@@:
endif
        ; Back to plain pointer / end-pointer form.
        add     rD, rN
        add     rN, 4 - 1

endif

        ; rN = number of bytes still to process
        sub     rN, rD
crc_end:
        test    rN, rN
        jz      func_end
@@:
        ; CRC1b ends with "dec rN", which sets ZF for the branch below.
        CRC1b
        jnz     @B

func_end:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_ENDP

end