; XzCrc64Opt.asm -- CRC64 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

NUM_WORDS       equ     3

if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <num_words_IS_INCORRECT>
endif

NUM_SKIP_BYTES  equ     ((NUM_WORDS - 2) * 4)
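; Note: NUM_WORDS selects the slicing width: each main-loop iteration consumes
; NUM_WORDS * 4 source bytes through NUM_WORDS * 4 CRC64 lookup tables.
; NUM_SKIP_BYTES (used when NUM_WORDS > 1) counts the "middle" bytes of a step
; that lie between the 8 bytes already folded into the CRC and the bytes
; prefetched for the next step.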


MOVZXLO macro dest:req, src:req
        movzx   dest, @CatStr(src, _L)
endm

MOVZXHI macro dest:req, src:req
        movzx   dest, @CatStr(src, _H)
endm
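; Note: MOVZXLO / MOVZXHI zero-extend the low / high byte of a 32-bit register;
; @CatStr builds the byte-register name (e.g. x1 -> x1_L / x1_H), assuming the
; _L / _H register aliases are defined by 7zAsm.asm.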


ifdef x64

rD      equ  r11
rN      equ  r10
rT      equ  r9

CRC_OP macro op:req, dest:req, src:req, t:req
        op      dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
endm

CRC_XOR macro dest:req, src:req, t:req
        CRC_OP  xor, dest, src, t
endm

CRC_MOV macro dest:req, src:req, t:req
        CRC_OP  mov, dest, src, t
endm
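; Note: rT points to the CRC64 tables; each table holds 256 QWORD entries
; (0800h bytes), and the t argument selects the table.  Assuming 7zAsm.asm maps
; x6_R to the full-width register r6, for example
;       CRC_XOR r0, x6, 2
; expands roughly to
;       xor     r0, QWORD PTR [rT + r6 * 8 + 1000h]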

CRC1b macro
        movzx   x6, BYTE PTR [rD]
        inc     rD
        MOVZXLO x3, x0
        xor     x6, x3
        shr     r0, 8
        CRC_XOR r0, x6, 0
        dec     rN
endm
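; Note: CRC1b is the byte-at-a-time step used for the unaligned head and the
; tail: crc = table0[(crc ^ *data) & 0xFF] ^ (crc >> 8); it also advances the
; data pointer (rD) and decrements the remaining byte count (rN).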


; ALIGN_MASK is the 3- or 7-byte alignment mask:
ALIGN_MASK      equ  (7 - (NUM_WORDS and 1) * 4)

if NUM_WORDS eq 1

src_rN_offset   equ  4
; + 4 for prefetching the next 4 bytes after the current iteration
NUM_BYTES_LIMIT equ  (NUM_WORDS * 4 + 4)
SRCDAT4         equ  DWORD PTR [rN + rD * 1]

XOR_NEXT macro
        mov     x1, [rD]
        xor     r0, r1
endm

else ; NUM_WORDS > 1

src_rN_offset   equ 8
; + 8 for prefetching the next 8 bytes after the current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)

XOR_NEXT macro
        xor     r0, QWORD PTR [rD] ; 64-bit read, can be unaligned
endm

; 32-bit or 64-bit
LOAD_SRC_MULT4 macro dest:req, word_index:req
        mov     dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
endm

endif
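; Note: inside the main loop rD is not a pointer but a (negative) offset
; relative to rN, so the source is addressed as [rN + rD + ...].  rN is biased
; by src_rN_offset (the 4 or 8 bytes that XOR_NEXT folds into the CRC ahead of
; the table lookups); SRCDAT4 reads at that bias, while LOAD_SRC_MULT4
; subtracts it again.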



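; Assumed C prototype, inferred from the parameter registers used below (the
; authoritative declaration is in the 7-Zip C sources); with NUM_WORDS = 3 the
; generated name is XzCrc64UpdateT12:
;
;   UInt64 XzCrc64UpdateT12(UInt64 crc, const void *data, size_t size,
;                           const UInt64 *table);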
MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

        mov     r0, REG_ABI_PARAM_0   ; r0  <- r1 / r7
        mov     rD, REG_ABI_PARAM_1   ; r11 <- r2 / r6
        mov     rN, REG_ABI_PARAM_2   ; r10 <- r8 / r2
if  (IS_LINUX gt 0)
        mov     rT, REG_ABI_PARAM_3   ; r9  <- r9 / r1
endif

        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
        jb      crc_end
@@:
        test    rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        XOR_NEXT
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
        sub     rD, rN
        add     rN, src_rN_offset
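; Note: after the setup above rD holds a negative index, so the end of the
; bulk region is reached when "add rD, NUM_WORDS * 4" wraps through zero and
; sets the carry flag (see the "jnc @B" below).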

align 16
@@:

if NUM_WORDS eq 1
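; Note: slicing-by-4: the four bytes of the current CRC low dword are looked
; up in tables 3..0 while the next source dword (SRCDAT4) is XORed in, so each
; iteration folds 4 source bytes into the CRC.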

        mov     x1, x0
        shr     x1, 8
        MOVZXLO x3, x1
        MOVZXLO x2, x0
        shr     x1, 8
        shr     r0, 32
        xor     x0, SRCDAT4
        CRC_XOR r0, x2, 3
        CRC_XOR r0, x3, 2
        MOVZXLO x2, x1
        shr     x1, 8
        CRC_XOR r0, x2, 1
        CRC_XOR r0, x1, 0

else ; NUM_WORDS > 1
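; Note: for NUM_WORDS > 1 the while-loop below folds the "middle" source words
; through the intermediate tables, and the block after it consumes the 8 bytes
; already held in the CRC through the highest tables.  Two accumulators
; (r6, r7) shorten the dependency chains; they are merged into r0 at the end
; of the iteration.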

if NUM_WORDS ne 2
  k = 2
  while k lt NUM_WORDS

        LOAD_SRC_MULT4  x1, k
    crc_op1  textequ <xor>

    if k eq 2
      if (NUM_WORDS and 1)
        LOAD_SRC_MULT4  x7, NUM_WORDS       ; aligned 32-bit
        LOAD_SRC_MULT4  x6, NUM_WORDS + 1   ; aligned 32-bit
        shl     r6, 32
      else
        LOAD_SRC_MULT4  r6, NUM_WORDS       ; aligned 64-bit
        crc_op1  textequ <mov>
      endif
    endif
        table = 4 * (NUM_WORDS - 1 - k)
        MOVZXLO x3, x1
        CRC_OP crc_op1, r7, x3, 3 + table
        MOVZXHI x3, x1
        shr     x1, 16
        CRC_XOR r6, x3, 2 + table
        MOVZXLO x3, x1
        shr     x1, 8
        CRC_XOR r7, x3, 1 + table
        CRC_XOR r6, x1, 0 + table
        k = k + 1
  endm
        crc_op2  textequ <xor>

else ; NUM_WORDS == 2
        LOAD_SRC_MULT4  r6, NUM_WORDS       ; aligned 64-bit
        crc_op2  textequ <mov>
endif ; NUM_WORDS == 2

        MOVZXHI x3, x0
        MOVZXLO x2, x0
        mov     r1, r0
        shr     r1, 32
        shr     x0, 16
        CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
        CRC_OP  crc_op2, r7, x3, NUM_SKIP_BYTES + 6
        MOVZXLO x2, x0
        MOVZXHI x5, x1
        MOVZXLO x3, x1
        shr     x0, 8
        shr     x1, 16
        CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
        CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
        CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
        CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
        MOVZXLO x2, x1
        shr     x1, 8
        CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
        CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
        xor     r0, r6
        xor     r0, r7

endif ; NUM_WORDS > 1
        add     rD, NUM_WORDS * 4
        jnc     @B

        sub     rN, src_rN_offset
        add     rD, rN
        XOR_NEXT
        add     rN, NUM_BYTES_LIMIT - 1
        sub     rN, rD
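; Note: tail handling: rD is restored to a real pointer and rN to the number
; of bytes still unprocessed; XOR_NEXT removes the prefetched bytes that the
; last iteration XORed into the CRC (XOR is its own inverse), and the leftover
; bytes are finished one at a time in the crc_end loop below.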

crc_end:
        test    rN, rN
        jz      func_end
@@:
        CRC1b
        jnz      @B
func_end:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_ENDP



else
; ==================================================================
; x86 (32-bit)

rD      equ  r7
rN      equ  r1
rT      equ  r5

xA      equ  x6
xA_R    equ  r6
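; Note: in this build the 64-bit CRC is kept in a register pair: x0 holds the
; low 32 bits and x2 the high 32 bits; xA (x6) is the scratch register used
; for byte extraction.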

ifdef x64
    num_VAR     equ  r8
else

crc_OFFS  equ  (REG_SIZE * 5)

if (IS_CDECL gt 0) or (IS_LINUX gt 0)
    ; cdecl or (GNU fastcall) stack:
    ;   (UInt32 *) table
    ;   size_t     size
    ;   void *     data
    ;   (UInt64)   crc
    ;   ret-ip <-(r4)
    data_OFFS   equ  (8 + crc_OFFS)
    size_OFFS   equ  (REG_SIZE + data_OFFS)
    table_OFFS  equ  (REG_SIZE + size_OFFS)
    num_VAR     equ  [r4 + size_OFFS]
    table_VAR   equ  [r4 + table_OFFS]
else
    ; Windows fastcall:
    ;   r1 = data, r2 = size
    ; stack:
    ;   (UInt32 *) table
    ;   (UInt64)   crc
    ;   ret-ip <-(r4)
    table_OFFS  equ  (8 + crc_OFFS)
    table_VAR   equ  [r4 + table_OFFS]
    num_VAR     equ  table_VAR
endif
endif ; x64

SRCDAT4         equ     DWORD PTR [rN + rD * 1]

CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
        op      dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
endm

CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
        CRC_1   op0, dest0, src, t, 0
        CRC_1   op1, dest1, src, t, 1
endm

CRC_XOR macro dest0:req, dest1:req, src:req, t:req
        CRC xor, xor, dest0, dest1, src, t
endm
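; Note: the table entries are 64-bit, but here each lookup is split into two
; 32-bit loads: word_index 0 is the low dword of the entry (applied to dest0)
; and word_index 1 is the high dword (applied to dest1).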


CRC1b macro
        movzx   xA, BYTE PTR [rD]
        inc     rD
        MOVZXLO x3, x0
        xor     xA, x3
        shrd    x0, x2, 8
        shr     x2, 8
        CRC_XOR x0, x2, xA, 0
        dec     rN
endm
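; Note: 32-bit variant of the byte-at-a-time step: shrd/shr shift the 64-bit
; value in the x0:x2 pair right by 8, then the table entry selected by
; (crc ^ *data) & 0xFF is applied to both halves.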


MY_PROLOG_BASE macro
        MY_PUSH_4_REGS
ifdef x64
        mov     r0, REG_ABI_PARAM_0     ; r0 <- r1 / r7
        mov     rT, REG_ABI_PARAM_3     ; r5 <- r9 / r1
        mov     rN, REG_ABI_PARAM_2     ; r1 <- r8 / r2
        mov     rD, REG_ABI_PARAM_1     ; r7 <- r2 / r6
        mov     r2, r0
        shr     r2, 32
        mov     x0, x0
else
    if (IS_CDECL gt 0) or (IS_LINUX gt 0)
        proc_numParams = proc_numParams + 2 ; for ABI_LINUX
        mov     rN, [r4 + size_OFFS]
        mov     rD, [r4 + data_OFFS]
    else
        mov     rD, REG_ABI_PARAM_0     ; r7 <- r1 : (data)
        mov     rN, REG_ABI_PARAM_1     ; r1 <- r2 : (size)
    endif
        mov     x0, [r4 + crc_OFFS]
        mov     x2, [r4 + crc_OFFS + 4]
        mov     rT, table_VAR
endif
endm


MY_EPILOG_BASE macro crc_end:req, func_end:req
crc_end:
        test    rN, rN
        jz      func_end
@@:
        CRC1b
        jnz      @B
func_end:
ifdef x64
        shl     r2, 32
        xor     r0, r2
endif
        MY_POP_4_REGS
endm


; ALIGN_MASK is the 3- or 7-byte alignment mask:
ALIGN_MASK  equ     (7 - (NUM_WORDS and 1) * 4)

if (NUM_WORDS eq 1)

NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)

MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
        MY_PROLOG_BASE

        cmp     rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
        jb      crc_end_4
@@:
        test    rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        xor     x0, [rD]
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
        sub     rD, rN
        add     rN, 4

        MOVZXLO xA, x0
align 16
@@:
        mov     x3, SRCDAT4
        xor     x3, x2
        shr     x0, 8
        CRC xor, mov, x3, x2, xA, 3
        MOVZXLO xA, x0
        shr     x0, 8
        ; MOVZXHI  xA, x0
        ; shr     x0, 16
        CRC_XOR x3, x2, xA, 2

        MOVZXLO xA, x0
        shr     x0, 8
        CRC_XOR x3, x2, xA, 1
        CRC_XOR x3, x2, x0, 0
        MOVZXLO xA, x3
        mov     x0, x3

        add     rD, 4
        jnc     @B

        sub     rN, 4
        add     rD, rN
        xor     x0, [rD]
        add     rN, NUM_BYTES_LIMIT_T4 - 1
        sub     rN, rD
        MY_EPILOG_BASE crc_end_4, func_end_4
MY_ENDP

else ; NUM_WORDS > 1

SHR_X macro x, imm
        shr x, imm
endm


ITER_1 macro v0, v1, a, off
        MOVZXLO xA, a
        SHR_X   a, 8
        CRC_XOR v0, v1, xA, off
endm


ITER_4 macro v0, v1, a, off
if 0 eq 0
        ITER_1  v0, v1, a, off + 3
        ITER_1  v0, v1, a, off + 2
        ITER_1  v0, v1, a, off + 1
        CRC_XOR v0, v1, a, off
elseif 0 eq 0
        MOVZXLO xA, a
        CRC_XOR v0, v1, xA, off + 3
        mov     xA, a
        ror     a, 16   ; 32-bit ror
        shr     xA, 24
        CRC_XOR v0, v1, xA, off
        MOVZXLO xA, a
        SHR_X   a, 24
        CRC_XOR v0, v1, xA, off + 1
        CRC_XOR v0, v1, a, off + 2
else
        ; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not a fast instruction
        MOVZXLO xA, a
        CRC_XOR v0, v1, xA, off + 3
        MOVZXHI xA, a
        SHR_X   a, 16
        CRC_XOR v0, v1, xA, off + 2
        MOVZXLO xA, a
        SHR_X   a, 8
        CRC_XOR v0, v1, xA, off + 1
        CRC_XOR v0, v1, a, off
endif
endm
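; Note: ITER_4 folds the four bytes of accumulator a into the v0:v1 CRC pair
; through tables off+3 .. off.  The if/elseif/else branches are alternative
; byte-extraction sequences with the same effect; the conditions are constant,
; so only the first branch is assembled and the others appear to be kept for
; experimentation.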



ITER_1_PAIR macro v0, v1, a0, a1, off
        ITER_1 v0, v1, a0, off + 4
        ITER_1 v0, v1, a1, off
endm

src_rD_offset equ 8
STEP_SIZE       equ     (NUM_WORDS * 4)

ITER_12_NEXT macro op, index, v0, v1
        op     v0, DWORD PTR [rD + (index + 1) * STEP_SIZE     - src_rD_offset]
        op     v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
endm

ITER_12 macro index, a0, a1, v0, v1

  if NUM_SKIP_BYTES  eq 0
        ITER_12_NEXT mov, index, v0, v1
  else
    k = 0
    while k lt NUM_SKIP_BYTES
        movzx   xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
      if k eq 0
        CRC mov, mov,   v0, v1, xA, NUM_SKIP_BYTES - 1 - k
      else
        CRC_XOR         v0, v1, xA, NUM_SKIP_BYTES - 1 - k
      endif
      k = k + 1
    endm
        ITER_12_NEXT xor, index, v0, v1
  endif

if 0 eq 0
        ITER_4  v0, v1, a0, NUM_SKIP_BYTES + 4
        ITER_4  v0, v1, a1, NUM_SKIP_BYTES
else ; the interleaved version is faster or slower depending on the processor
        ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
        ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
        ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
        CRC_XOR     v0, v1, a0,     NUM_SKIP_BYTES + 4
        CRC_XOR     v0, v1, a1,     NUM_SKIP_BYTES
endif
endm
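; Note: ITER_12 processes one STEP_SIZE block: the NUM_SKIP_BYTES "middle"
; bytes are folded in bytewise from memory, ITER_12_NEXT loads (or XORs) the
; following 8 source bytes into v0:v1, and the two ITER_4 calls consume the
; 8 bytes already held in a0:a1 through the highest tables.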

; we use (UNROLL_CNT > 1) to reduce read-port pressure (fewer num_VAR reads)
UNROLL_CNT      equ     (2 * 1)
NUM_BYTES_LIMIT equ     (STEP_SIZE * UNROLL_CNT + 8)

MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
        MY_PROLOG_BASE

        cmp     rN, NUM_BYTES_LIMIT + ALIGN_MASK
        jb      crc_end_12
@@:
        test    rD, ALIGN_MASK
        jz      @F
        CRC1b
        jmp     @B
@@:
        xor     x0, [rD]
        xor     x2, [rD + 4]
        add     rD, src_rD_offset
        lea     rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
        mov     num_VAR, rN

align 16
@@:
    i = 0
    rept UNROLL_CNT
      if (i and 1) eq 0
        ITER_12     i, x0, x2,  x1, x3
      else
        ITER_12     i, x1, x3,  x0, x2
      endif
      i = i + 1
    endm

    if (UNROLL_CNT and 1)
        mov     x0, x1
        mov     x2, x3
    endif
        add     rD, STEP_SIZE * UNROLL_CNT
        cmp     rD, num_VAR
        jb      @B

        mov     rN, num_VAR
        add     rN, NUM_BYTES_LIMIT - 1
        sub     rN, rD
        sub     rD, src_rD_offset
        xor     x0, [rD]
        xor     x2, [rD + 4]

        MY_EPILOG_BASE crc_end_12, func_end_12
MY_ENDP

endif ; (NUM_WORDS > 1)
endif ; ! x64
end