xref: /aosp_15_r20/external/lzma/Asm/x86/7zCrcOpt.asm (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1*f6dc9357SAndroid Build Coastguard Worker; 7zCrcOpt.asm -- CRC32 calculation : optimized version
2*f6dc9357SAndroid Build Coastguard Worker; 2023-12-08 : Igor Pavlov : Public domain
3*f6dc9357SAndroid Build Coastguard Worker
4*f6dc9357SAndroid Build Coastguard Workerinclude 7zAsm.asm
5*f6dc9357SAndroid Build Coastguard Worker
6*f6dc9357SAndroid Build Coastguard WorkerMY_ASM_START
7*f6dc9357SAndroid Build Coastguard Worker
; Build-time tuning constants, validated by the checks that follow.
;   NUM_WORDS  : number of 32-bit data words folded by one CRC_ITER expansion;
;                the lookups address sub-tables 0 .. NUM_WORDS * 4 - 1.
;   UNROLL_CNT : number of CRC_ITER expansions emitted per pass of the main loop.
8*f6dc9357SAndroid Build Coastguard WorkerNUM_WORDS       equ     3
9*f6dc9357SAndroid Build Coastguard WorkerUNROLL_CNT      equ     2
10*f6dc9357SAndroid Build Coastguard Worker
; Reject configurations outside 1..64 words per iteration.
11*f6dc9357SAndroid Build Coastguard Workerif (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
12*f6dc9357SAndroid Build Coastguard Worker.err <NUM_WORDS_IS_INCORRECT>
13*f6dc9357SAndroid Build Coastguard Workerendif
; At least one CRC_ITER expansion is required per loop pass.
14*f6dc9357SAndroid Build Coastguard Workerif (UNROLL_CNT lt 1)
15*f6dc9357SAndroid Build Coastguard Worker.err <UNROLL_CNT_IS_INCORRECT>
16*f6dc9357SAndroid Build Coastguard Workerendif
17*f6dc9357SAndroid Build Coastguard Worker
; Register roles used throughout this file (r2/x2/r7/r5 are aliases that are
; presumably defined in 7zAsm.asm -- TODO confirm the physical registers there):
;   rD   : data pointer (later reused as a negative offset relative to rN)
;   rD_x : x2, presumably the 32-bit view of rD; used only for the alignment test
;   rN   : remaining size (later reused as the loop limit / base address)
;   rT   : base address of the CRC lookup tables
18*f6dc9357SAndroid Build Coastguard WorkerrD      equ  r2
19*f6dc9357SAndroid Build Coastguard WorkerrD_x    equ  x2
20*f6dc9357SAndroid Build Coastguard WorkerrN      equ  r7
21*f6dc9357SAndroid Build Coastguard WorkerrT      equ  r5
22*f6dc9357SAndroid Build Coastguard Worker
; Non-x64 builds only: offsets from r4 of the arguments that the procedure
; reads from memory. With IS_CDECL all four arguments (crc, data, size, table)
; occupy consecutive REG_SIZE slots; otherwise only size and table are read
; from memory and crc comes from REG_ABI_PARAM_0_x.
; NOTE(review): the REG_SIZE * 5 base presumably skips the return address plus
; the registers pushed by the prologue macro -- confirm against 7zAsm.asm.
23*f6dc9357SAndroid Build Coastguard Workerifndef x64
24*f6dc9357SAndroid Build Coastguard Worker    if (IS_CDECL gt 0)
25*f6dc9357SAndroid Build Coastguard Worker        crc_OFFS    equ (REG_SIZE * 5)
26*f6dc9357SAndroid Build Coastguard Worker        data_OFFS   equ (REG_SIZE + crc_OFFS)
27*f6dc9357SAndroid Build Coastguard Worker        size_OFFS   equ (REG_SIZE + data_OFFS)
28*f6dc9357SAndroid Build Coastguard Worker    else
29*f6dc9357SAndroid Build Coastguard Worker        size_OFFS   equ (REG_SIZE * 5)
30*f6dc9357SAndroid Build Coastguard Worker    endif
31*f6dc9357SAndroid Build Coastguard Worker        table_OFFS  equ (REG_SIZE + size_OFFS)
32*f6dc9357SAndroid Build Coastguard Workerendif
33*f6dc9357SAndroid Build Coastguard Worker
; Address-expression prefixes for reading source data at (rN + rD).
; Each equate deliberately ends with a dangling `*`, so the expression written
; right after it becomes the scaled displacement:
;   [SRCDAT_1 k]  ->  [rN + rD * 1 + 1 * k]   (k counted in bytes)
;   [SRCDAT_4 k]  ->  [rN + rD * 1 + 4 * k]   (k counted in 32-bit words)
34*f6dc9357SAndroid Build Coastguard Worker; rN + rD is same speed as rD, but we reduce one instruction in loop
35*f6dc9357SAndroid Build Coastguard WorkerSRCDAT_1        equ     rN + rD * 1 + 1 *
36*f6dc9357SAndroid Build Coastguard WorkerSRCDAT_4        equ     rN + rD * 1 + 4 *
37*f6dc9357SAndroid Build Coastguard Worker
; CRC op, dest, src, t
;   Emits one table access: `op dest, dword ptr [rT + src_R * 4 + 0400h * t]`.
;     op   : instruction mnemonic to apply (the wrappers in this file pass xor or mov)
;     dest : destination register
;     src  : register name holding the entry index; `_R` is appended to the name
;            (presumably the pointer-width alias from 7zAsm.asm -- TODO confirm)
;     t    : sub-table number; each step of t advances the address by 0400h bytes
38*f6dc9357SAndroid Build Coastguard WorkerCRC macro op:req, dest:req, src:req, t:req
39*f6dc9357SAndroid Build Coastguard Worker        op      dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
40*f6dc9357SAndroid Build Coastguard Workerendm
41*f6dc9357SAndroid Build Coastguard Worker
; CRC_XOR dest, src, t
;   dest ^= dword entry number src of sub-table t (see the CRC macro).
42*f6dc9357SAndroid Build Coastguard WorkerCRC_XOR macro dest:req, src:req, t:req
43*f6dc9357SAndroid Build Coastguard Worker        CRC     xor, dest, src, t
44*f6dc9357SAndroid Build Coastguard Workerendm
45*f6dc9357SAndroid Build Coastguard Worker
; CRC_MOV dest, src, t
;   dest = dword entry number src of sub-table t (see the CRC macro).
46*f6dc9357SAndroid Build Coastguard WorkerCRC_MOV macro dest:req, src:req, t:req
47*f6dc9357SAndroid Build Coastguard Worker        CRC     mov, dest, src, t
48*f6dc9357SAndroid Build Coastguard Workerendm
49*f6dc9357SAndroid Build Coastguard Worker
; MOVZXLO dest, src
;   Zero-extends the `src_L` sub-register into dest.
;   NOTE(review): `_L` is presumably the low-byte alias (bits 0..7) of src,
;   defined in 7zAsm.asm -- confirm there.
50*f6dc9357SAndroid Build Coastguard WorkerMOVZXLO macro dest:req, src:req
51*f6dc9357SAndroid Build Coastguard Worker        movzx   dest, @CatStr(src, _L)
52*f6dc9357SAndroid Build Coastguard Workerendm
53*f6dc9357SAndroid Build Coastguard Worker
; MOVZXHI dest, src
;   Zero-extends the `src_H` sub-register into dest.
;   NOTE(review): `_H` is presumably the second-byte alias (bits 8..15) of src,
;   defined in 7zAsm.asm -- confirm there.
54*f6dc9357SAndroid Build Coastguard WorkerMOVZXHI macro dest:req, src:req
55*f6dc9357SAndroid Build Coastguard Worker        movzx   dest, @CatStr(src, _H)
56*f6dc9357SAndroid Build Coastguard Workerendm
57*f6dc9357SAndroid Build Coastguard Worker
58*f6dc9357SAndroid Build Coastguard Worker; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
59*f6dc9357SAndroid Build Coastguard Worker; movzx x3, x0_L sometimes is 0   cycles latency (not always)
60*f6dc9357SAndroid Build Coastguard Worker; movzx x3, x0_L sometimes is 0.5 cycles latency
61*f6dc9357SAndroid Build Coastguard Worker; movzx x3, x0_H is 2 cycles latency in some cpus
62*f6dc9357SAndroid Build Coastguard Worker
; CRC1b
;   Folds one data byte into the running CRC held in x0:
;     x0 = (x0 >> 8) ^ table0[(x0 & 0FFh) ^ byte at [rD]]
;   Advances rD by one and decrements rN.
;   Uses x3 and x6 as scratch. `dec rN` is intentionally the final instruction:
;   the tail loop of the procedure branches on the flags it leaves (jnz).
63*f6dc9357SAndroid Build Coastguard WorkerCRC1b macro
64*f6dc9357SAndroid Build Coastguard Worker        movzx   x6, byte ptr [rD]
65*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x3, x0
66*f6dc9357SAndroid Build Coastguard Worker        inc     rD
67*f6dc9357SAndroid Build Coastguard Worker        shr     x0, 8
68*f6dc9357SAndroid Build Coastguard Worker        xor     x6, x3
69*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR x0, x6, 0
70*f6dc9357SAndroid Build Coastguard Worker        dec     rN
71*f6dc9357SAndroid Build Coastguard Workerendm
72*f6dc9357SAndroid Build Coastguard Worker
; LOAD_1 dest, t, iter, index
;   Zero-extending 8-bit load into dest of byte `index` within source word
;   number (NUM_WORDS - 1 - t + iter * NUM_WORDS), addressed from rN + rD.
73*f6dc9357SAndroid Build Coastguard WorkerLOAD_1 macro dest:req, t:req, iter:req, index:req
74*f6dc9357SAndroid Build Coastguard Worker        movzx   dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
75*f6dc9357SAndroid Build Coastguard Workerendm
76*f6dc9357SAndroid Build Coastguard Worker
; LOAD_2 dest, t, iter, index
;   Zero-extending 16-bit load into dest starting at byte `index` within source
;   word number (NUM_WORDS - 1 - t + iter * NUM_WORDS), addressed from rN + rD.
77*f6dc9357SAndroid Build Coastguard WorkerLOAD_2 macro dest:req, t:req, iter:req, index:req
78*f6dc9357SAndroid Build Coastguard Worker        movzx   dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
79*f6dc9357SAndroid Build Coastguard Workerendm
80*f6dc9357SAndroid Build Coastguard Worker
; CRC_QUAD nn, t, iter
;   XORs into nn the four table entries selected by the four bytes of one
;   source word (word number NUM_WORDS - 1 - t + iter * NUM_WORDS at rN + rD):
;     byte 0 -> sub-table t * 4 + 3
;     byte 1 -> sub-table t * 4 + 2
;     byte 2 -> sub-table t * 4 + 1
;     byte 3 -> sub-table t * 4 + 0
;   Four alternative code sequences are kept; the conditionals select exactly
;   one: the first for x64 builds, the last otherwise (the two `elseif 0`
;   variants are disabled). Scratch registers: x3 and x6, plus x9 on x64.
81*f6dc9357SAndroid Build Coastguard WorkerCRC_QUAD macro nn, t:req, iter:req
82*f6dc9357SAndroid Build Coastguard Workerifdef x64
83*f6dc9357SAndroid Build Coastguard Worker        ; paired memory loads give 1-3% speed gain, but it uses more registers
; x3 = bytes 0..1 and x9 = bytes 2..3 of the word; after each `shr ..., 8`
; the register holds only the upper byte of its pair.
84*f6dc9357SAndroid Build Coastguard Worker        LOAD_2  x3, t, iter, 0
85*f6dc9357SAndroid Build Coastguard Worker        LOAD_2  x9, t, iter, 2
86*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x3
87*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 8
88*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 3
89*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x9
90*f6dc9357SAndroid Build Coastguard Worker        shr     x9, 8
91*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 2
92*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 1
93*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x9, t * 4 + 0
94*f6dc9357SAndroid Build Coastguard Workerelseif 0
; Disabled variant: two 16-bit loads processed one after the other using only x3/x6.
95*f6dc9357SAndroid Build Coastguard Worker        LOAD_2  x3, t, iter, 0
96*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x3
97*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 8
98*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 3
99*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 2
100*f6dc9357SAndroid Build Coastguard Worker        LOAD_2  x3, t, iter, 2
101*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x3
102*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 8
103*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 1
104*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 0
105*f6dc9357SAndroid Build Coastguard Workerelseif 0
; Disabled variant: four separate 8-bit loads.
106*f6dc9357SAndroid Build Coastguard Worker        LOAD_1  x3, t, iter, 0
107*f6dc9357SAndroid Build Coastguard Worker        LOAD_1  x6, t, iter, 1
108*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 3
109*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 2
110*f6dc9357SAndroid Build Coastguard Worker        LOAD_1  x3, t, iter, 2
111*f6dc9357SAndroid Build Coastguard Worker        LOAD_1  x6, t, iter, 3
112*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 1
113*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 0
114*f6dc9357SAndroid Build Coastguard Workerelse
115*f6dc9357SAndroid Build Coastguard Worker        ; 32-bit load is better if there is only one read port (core2)
116*f6dc9357SAndroid Build Coastguard Worker        ; but that code can be slower if there are 2 read ports (snb)
; Single 32-bit load into x3; bytes are peeled off through x6, and the final
; `shr x3, 8` leaves the top byte in x3 for the last lookup.
117*f6dc9357SAndroid Build Coastguard Worker        mov     x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter *  NUM_WORDS) + 0)]
118*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x3
119*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 3
120*f6dc9357SAndroid Build Coastguard Worker        MOVZXHI x6, x3
121*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 16
122*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 2
123*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x3
124*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 8
125*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, t * 4 + 1
126*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, t * 4 + 0
127*f6dc9357SAndroid Build Coastguard Workerendif
128*f6dc9357SAndroid Build Coastguard Workerendm
129*f6dc9357SAndroid Build Coastguard Worker
130*f6dc9357SAndroid Build Coastguard Worker
; Index of the first of the four highest-numbered sub-tables; CRC_ITER uses
; LAST + 0 .. LAST + 3 for the bytes of the register-held word.
131*f6dc9357SAndroid Build Coastguard WorkerLAST    equ     (4 * (NUM_WORDS - 1))
132*f6dc9357SAndroid Build Coastguard Worker
; CRC_ITER qq, nn, iter
;   One unrolled step covering NUM_WORDS source words.
;     qq   : register holding the incoming state (already XORed with the first
;            source word of this step); it is consumed (destroyed) here
;     nn   : register that receives the outgoing state
;     iter : zero-based index of this step inside the unrolled loop body
;   nn is first loaded with the source word that follows this step's
;   NUM_WORDS words, then the table entries for the remaining NUM_WORDS - 1
;   memory words (CRC_QUAD) and for the four bytes of qq are XORed into it.
;   Scratch registers: x3, x6 (plus whatever CRC_QUAD uses).
133*f6dc9357SAndroid Build Coastguard WorkerCRC_ITER macro qq, nn, iter
134*f6dc9357SAndroid Build Coastguard Worker        mov     nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]
135*f6dc9357SAndroid Build Coastguard Worker
; Assembly-time loop: one CRC_QUAD per memory-resident word, t = 0 .. NUM_WORDS - 2.
136*f6dc9357SAndroid Build Coastguard Worker    i = 0
137*f6dc9357SAndroid Build Coastguard Worker    rept NUM_WORDS - 1
138*f6dc9357SAndroid Build Coastguard Worker        CRC_QUAD nn, i, iter
139*f6dc9357SAndroid Build Coastguard Worker        i = i + 1
140*f6dc9357SAndroid Build Coastguard Worker    endm
141*f6dc9357SAndroid Build Coastguard Worker
; Bytes of qq: byte 0 -> LAST + 3, byte 3 -> LAST + 0; after `ror qq, 16`
; the low byte is the original byte 2 (-> LAST + 1) and `shr qq, 24` leaves
; the original byte 1 (-> LAST + 2).
142*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, qq
143*f6dc9357SAndroid Build Coastguard Worker        mov     x3, qq
144*f6dc9357SAndroid Build Coastguard Worker        shr     x3, 24
145*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, LAST + 3
146*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x3, LAST + 0
147*f6dc9357SAndroid Build Coastguard Worker        ror     qq, 16
148*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, qq
149*f6dc9357SAndroid Build Coastguard Worker        shr     qq, 24
150*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, x6, LAST + 1
; With an odd UNROLL_CNT the last step would leave the state in nn, while the
; loop in the procedure always starts its first step from x0; in that case the
; final lookup is loaded into qq and combined there so the result ends in qq.
151*f6dc9357SAndroid Build Coastguard Workerif ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
152*f6dc9357SAndroid Build Coastguard Worker        CRC_MOV qq, qq, LAST + 2
153*f6dc9357SAndroid Build Coastguard Worker        xor     qq, nn
154*f6dc9357SAndroid Build Coastguard Workerelse
155*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR nn, qq, LAST + 2
156*f6dc9357SAndroid Build Coastguard Workerendif
157*f6dc9357SAndroid Build Coastguard Workerendm
158*f6dc9357SAndroid Build Coastguard Worker
159*f6dc9357SAndroid Build Coastguard Worker
; NUM_BYTES_LIMIT : bytes that must be readable for one pass of the unrolled
;                   loop (NUM_WORDS * 4 * UNROLL_CNT) plus the extra word noted below.
; ALIGN_MASK      : low address bits that must be zero before the word loops
;                   start (3 => 4-byte alignment of the data pointer).
160*f6dc9357SAndroid Build Coastguard Worker; + 4 for prefetching next 4-bytes after current iteration
161*f6dc9357SAndroid Build Coastguard WorkerNUM_BYTES_LIMIT equ     (NUM_WORDS * 4 * UNROLL_CNT + 4)
162*f6dc9357SAndroid Build Coastguard WorkerALIGN_MASK      equ     3
163*f6dc9357SAndroid Build Coastguard Worker
164*f6dc9357SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; CrcUpdateT<NUM_WORDS * 4>  (CrcUpdateT12 with NUM_WORDS = 3), 4 parameters:
;   param 0 : crc   -> x0   (running CRC value)
;   param 1 : data  -> rD   (pointer to the input bytes)
;   param 2 : size  -> rN   (number of input bytes)
;   param 3 : table -> rT   (base of the lookup tables)
; The updated CRC is left in x0.
; NOTE(review): x0 is presumably the ABI return register -- confirm in 7zAsm.asm.
; Phases: byte-wise alignment of rD, unrolled multi-word loop, 4-byte loop,
; byte-wise tail. Inputs shorter than NUM_BYTES_LIMIT + ALIGN_MASK go straight
; to the byte-wise tail.
; Scratch registers written: x1, x3, x6 (and x9 on x64, via CRC_QUAD).
;-----------------------------------------------------------------------
165*f6dc9357SAndroid Build Coastguard Worker; MY_PROC @CatStr(CrcUpdateT, 12), 4
166*f6dc9357SAndroid Build Coastguard WorkerMY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
167*f6dc9357SAndroid Build Coastguard Worker        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
; --- Fetch the arguments into x0 / rD / rN / rT ---
168*f6dc9357SAndroid Build Coastguard Worker    ifdef x64
169*f6dc9357SAndroid Build Coastguard Worker        mov     x0, REG_ABI_PARAM_0_x   ; x0 = x1(win) / x7(linux)
170*f6dc9357SAndroid Build Coastguard Worker        mov     rT, REG_ABI_PARAM_3     ; r5 = r9(win) / x1(linux)
171*f6dc9357SAndroid Build Coastguard Worker        mov     rN, REG_ABI_PARAM_2     ; r7 = r8(win) / r2(linux)
172*f6dc9357SAndroid Build Coastguard Worker        ; mov     rD, REG_ABI_PARAM_1     ; r2 = r2(win)
173*f6dc9357SAndroid Build Coastguard Worker      if  (IS_LINUX gt 0)
174*f6dc9357SAndroid Build Coastguard Worker        mov     rD, REG_ABI_PARAM_1     ; r2 = r6
175*f6dc9357SAndroid Build Coastguard Worker      endif
176*f6dc9357SAndroid Build Coastguard Worker    else
177*f6dc9357SAndroid Build Coastguard Worker      if  (IS_CDECL gt 0)
178*f6dc9357SAndroid Build Coastguard Worker        mov     x0, [r4 + crc_OFFS]
179*f6dc9357SAndroid Build Coastguard Worker        mov     rD, [r4 + data_OFFS]
180*f6dc9357SAndroid Build Coastguard Worker      else
181*f6dc9357SAndroid Build Coastguard Worker        mov     x0, REG_ABI_PARAM_0_x
182*f6dc9357SAndroid Build Coastguard Worker      endif
183*f6dc9357SAndroid Build Coastguard Worker        mov     rN, [r4 + size_OFFS]
184*f6dc9357SAndroid Build Coastguard Worker        mov     rT, [r4 + table_OFFS]
185*f6dc9357SAndroid Build Coastguard Worker    endif
186*f6dc9357SAndroid Build Coastguard Worker
; --- Short input: skip the word loops entirely (unsigned compare). The extra
;     ALIGN_MASK keeps at least NUM_BYTES_LIMIT bytes available after the
;     alignment loop has consumed up to ALIGN_MASK bytes. ---
187*f6dc9357SAndroid Build Coastguard Worker        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
188*f6dc9357SAndroid Build Coastguard Worker        jb      crc_end
; --- Alignment loop: consume single bytes until rD is 4-byte aligned ---
189*f6dc9357SAndroid Build Coastguard Worker@@:
190*f6dc9357SAndroid Build Coastguard Worker        test    rD_x, ALIGN_MASK    ; test    rD, ALIGN_MASK
191*f6dc9357SAndroid Build Coastguard Worker        jz      @F
192*f6dc9357SAndroid Build Coastguard Worker        CRC1b
193*f6dc9357SAndroid Build Coastguard Worker        jmp     @B
; --- Word-loop setup: fold the first data word into x0, then re-base so that
;     rN = data + size - (NUM_BYTES_LIMIT - 1) and rD = data - rN. From here
;     rN + rD is the current read address and rD counts up towards zero. ---
194*f6dc9357SAndroid Build Coastguard Worker@@:
195*f6dc9357SAndroid Build Coastguard Worker        xor     x0, dword ptr [rD]
196*f6dc9357SAndroid Build Coastguard Worker        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
197*f6dc9357SAndroid Build Coastguard Worker        sub     rD, rN
198*f6dc9357SAndroid Build Coastguard Worker
; --- Main loop: UNROLL_CNT CRC_ITER steps per pass, alternating x0 and x1 as
;     the incoming/outgoing state registers ---
199*f6dc9357SAndroid Build Coastguard Workeralign 16
200*f6dc9357SAndroid Build Coastguard Worker@@:
201*f6dc9357SAndroid Build Coastguard Workerunr_index = 0
202*f6dc9357SAndroid Build Coastguard Workerwhile unr_index lt UNROLL_CNT
203*f6dc9357SAndroid Build Coastguard Worker    if (unr_index and 1) eq 0
204*f6dc9357SAndroid Build Coastguard Worker        CRC_ITER x0, x1, unr_index
205*f6dc9357SAndroid Build Coastguard Worker    else
206*f6dc9357SAndroid Build Coastguard Worker        CRC_ITER x1, x0, unr_index
207*f6dc9357SAndroid Build Coastguard Worker    endif
208*f6dc9357SAndroid Build Coastguard Worker        unr_index = unr_index + 1
209*f6dc9357SAndroid Build Coastguard Workerendm
210*f6dc9357SAndroid Build Coastguard Worker
; Advance the offset; the add produces a carry once rD wraps past zero, which
; ends the loop.
211*f6dc9357SAndroid Build Coastguard Worker        add     rD, NUM_WORDS * 4 * UNROLL_CNT
212*f6dc9357SAndroid Build Coastguard Worker        jnc     @B
213*f6dc9357SAndroid Build Coastguard Worker
214*f6dc9357SAndroid Build Coastguard Workerif 0
215*f6dc9357SAndroid Build Coastguard Worker        ; byte version
; Disabled alternative: undo the pending word XOR and finish byte-by-byte.
216*f6dc9357SAndroid Build Coastguard Worker        add     rD, rN
217*f6dc9357SAndroid Build Coastguard Worker        xor     x0, dword ptr [rD]
218*f6dc9357SAndroid Build Coastguard Worker        add     rN, NUM_BYTES_LIMIT - 1
219*f6dc9357SAndroid Build Coastguard Workerelse
220*f6dc9357SAndroid Build Coastguard Worker        ; 4-byte version
; Shift the limit by one unrolled pass (rN + rD, the read address, is
; unchanged) so the loop below can keep going one word at a time.
221*f6dc9357SAndroid Build Coastguard Worker        add     rN, 4 * NUM_WORDS * UNROLL_CNT
222*f6dc9357SAndroid Build Coastguard Worker        sub     rD, 4 * NUM_WORDS * UNROLL_CNT
; --- 4-byte loop: x0 already contains the current word XORed in; replace it
;     by the XOR of the four table entries selected by its bytes
;     (byte 0 -> table 3, byte 1 -> table 2, byte 2 -> table 1, byte 3 -> table 0).
;     x1, x3 and x6 are scratch. ---
223*f6dc9357SAndroid Build Coastguard Worker@@:
224*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x3, x0
225*f6dc9357SAndroid Build Coastguard Worker        MOVZXHI x1, x0
226*f6dc9357SAndroid Build Coastguard Worker        shr     x0, 16
227*f6dc9357SAndroid Build Coastguard Worker        MOVZXLO x6, x0
228*f6dc9357SAndroid Build Coastguard Worker        shr     x0, 8
229*f6dc9357SAndroid Build Coastguard Worker        CRC_MOV x0, x0, 0
230*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR x0, x3, 3
231*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR x0, x1, 2
232*f6dc9357SAndroid Build Coastguard Worker        CRC_XOR x0, x6, 1
233*f6dc9357SAndroid Build Coastguard Worker
; Step to the next word; on carry the limit has been reached and no further
; word is folded in. When NUM_WORDS * UNROLL_CNT is 1 this runs exactly once.
234*f6dc9357SAndroid Build Coastguard Worker        add     rD, 4
235*f6dc9357SAndroid Build Coastguard Workerif (NUM_WORDS * UNROLL_CNT) ne 1
236*f6dc9357SAndroid Build Coastguard Worker        jc      @F
237*f6dc9357SAndroid Build Coastguard Worker        xor     x0, [SRCDAT_4 0]
238*f6dc9357SAndroid Build Coastguard Worker        jmp     @B
239*f6dc9357SAndroid Build Coastguard Worker@@:
240*f6dc9357SAndroid Build Coastguard Workerendif
; Convert back to an absolute pointer (rD) and to the end-of-data address (rN).
241*f6dc9357SAndroid Build Coastguard Worker        add     rD, rN
242*f6dc9357SAndroid Build Coastguard Worker        add     rN, 4 - 1
243*f6dc9357SAndroid Build Coastguard Worker
244*f6dc9357SAndroid Build Coastguard Workerendif
245*f6dc9357SAndroid Build Coastguard Worker
; rN = number of bytes still unprocessed.
246*f6dc9357SAndroid Build Coastguard Worker        sub     rN, rD
; --- Byte-wise tail (also the entry point for short inputs) ---
247*f6dc9357SAndroid Build Coastguard Workercrc_end:
248*f6dc9357SAndroid Build Coastguard Worker        test    rN, rN
249*f6dc9357SAndroid Build Coastguard Worker        jz      func_end
250*f6dc9357SAndroid Build Coastguard Worker@@:
; CRC1b ends with `dec rN`; jnz repeats until the count reaches zero.
251*f6dc9357SAndroid Build Coastguard Worker        CRC1b
252*f6dc9357SAndroid Build Coastguard Worker        jnz     @B
253*f6dc9357SAndroid Build Coastguard Worker
254*f6dc9357SAndroid Build Coastguard Workerfunc_end:
255*f6dc9357SAndroid Build Coastguard Worker        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
256*f6dc9357SAndroid Build Coastguard WorkerMY_ENDP
257*f6dc9357SAndroid Build Coastguard Worker
258*f6dc9357SAndroid Build Coastguard Workerend
259