Lines Matching +full:sub +full:- +full:block
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * AES-XTS for modern x86_64 CPUs
11 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
16 * AES-NI + AVX
17 * - 128-bit vectors (1 AES block per vector)
18 * - VEX-coded instructions
19 * - xmm0-xmm15
20 * - This is for older CPUs that lack VAES but do have AVX.
23 * - 256-bit vectors (2 AES blocks per vector)
24 * - VEX-coded instructions
25 * - ymm0-ymm15
26 * - This is for CPUs that have VAES but lack AVX512 or AVX10,
30 * - 256-bit vectors (2 AES blocks per vector)
31 * - EVEX-coded instructions
32 * - ymm0-ymm31
33 * - This is for CPUs that have AVX512 but where using zmm registers causes
35 * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
36 * To avoid confusion with 512-bit, we just write AVX10/256.
39 * - Same as the previous one, but upgrades to 512-bit vectors
40 * (4 AES blocks per vector) in zmm0-zmm31.
41 * - This is for CPUs that have good AVX512 or AVX10/512 support.
43 * This file doesn't have an implementation for AES-NI alone (without AVX), as
51 * The AES-XTS implementations in this file support everything required by the
52 * crypto API, including support for arbitrary input lengths and multi-part
54 * power-of-2 length inputs that are processed in a single part (disk sectors).
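For orientation, here is a minimal C model of the XTS flow that all four implementations compute, block at a time and whole blocks only. The aes_block_fn callback and the function names are placeholders for this sketch, not the kernel's interfaces.

#include <stddef.h>
#include <stdint.h>

/* Placeholder for single-block AES encryption with an already-expanded key. */
typedef void (*aes_block_fn)(const void *key, const uint8_t in[16],
			     uint8_t out[16]);

/* Multiply the tweak by x in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1.
 * XTS treats the 16-byte tweak as a little-endian 128-bit integer. */
static void gf_mul_x(uint8_t t[16])
{
	int carry = t[15] >> 7;

	for (int i = 15; i > 0; i--)
		t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
	t[0] = (uint8_t)(t[0] << 1);
	if (carry)
		t[0] ^= 0x87;
}

/* XTS over whole 16-byte blocks (no partial-block handling). */
static void xts_encrypt_blocks(aes_block_fn encrypt, const void *data_key,
			       const void *tweak_key, const uint8_t iv[16],
			       const uint8_t *src, uint8_t *dst, size_t nblocks)
{
	uint8_t t[16], buf[16];

	/* The first tweak is the IV encrypted with the second ("tweak") key. */
	encrypt(tweak_key, iv, t);

	for (size_t i = 0; i < nblocks; i++, src += 16, dst += 16) {
		for (int j = 0; j < 16; j++)
			buf[j] = src[j] ^ t[j];
		encrypt(data_key, buf, buf);
		for (int j = 0; j < 16; j++)
			dst[j] = buf[j] ^ t[j];
		gf_mul_x(t);		/* tweak for the next block */
	}
}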
73 // on CPUs that don't support AVX10-style masking.
96 // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
110 // advanced to point to 7th-from-last round key
122 // %r9-r11 are available as temporaries.
124 // V0-V3 hold the data blocks during the main loop, or temporary values
125 // otherwise. V4-V5 hold temporary values.
127 // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
135 // V10-V13 are used for computing the next values of TWEAK[0-3].
141 // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
145 // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
149 // If 32 SIMD registers are available, then V16-V29 hold the remaining
150 // AES round keys, copied to all 128-bit lanes.
152 // AES-128, AES-192, and AES-256 use different numbers of round keys.
154 // keys to the *end* of this register range. I.e., AES-128 uses
155 // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
156 // (All also use KEY0 for the XOR-only "round" at the beginning.)
187 // V30-V31 are currently unused.
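The mapping from key length to round count, and from there to the first KEY register used, can be checked with a small standalone calculation (a reference check only; KEYn are the register aliases described above):

#include <stdio.h>

int main(void)
{
	/* AES uses 10, 12, or 14 rounds for 16-, 24-, or 32-byte keys. */
	for (int keylen = 16; keylen <= 32; keylen += 8) {
		int nrounds = 6 + keylen / 4;

		/* Rounds 1..nrounds are aligned to the *end* of KEY1-KEY14,
		 * so the first register used is KEY(15 - nrounds). */
		printf("AES-%d: %d rounds, keys in KEY%d-KEY14 (plus KEY0)\n",
		       keylen * 8, nrounds, 15 - nrounds);
	}
	return 0;
}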
200 // Broadcast a 128-bit value into a vector.
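The same broadcast-then-operate pattern can be written with intrinsics: load one 16-byte round key, replicate it into both 128-bit lanes of a ymm register, and a single VAES instruction then applies that round to two blocks at once. A sketch only, assuming VAES and AVX2 support (e.g. building with -mvaes -mavx2):

#include <immintrin.h>

/* One AES encryption round applied to two blocks packed in a ymm register,
 * with the 16-byte round key replicated into both 128-bit lanes. */
static __m256i aes_round_2blocks(__m256i blocks, const void *round_key)
{
	__m128i rk = _mm_loadu_si128((const __m128i *)round_key);
	__m256i rk2 = _mm256_broadcastsi128_si256(rk);	/* copy to both lanes */

	return _mm256_aesenc_epi128(blocks, rk2);	/* VAES: vaesenc on ymm */
}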
224 // vpternlogd with immediate 0x96 is a three-argument XOR.
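The immediate 0x96 is exactly the truth table of a three-input XOR: bit i of the immediate is the result for the input combination whose three bits are the bits of i. A quick standalone check of that claim:

#include <assert.h>

int main(void)
{
	/* For vpternlogd, each result bit is imm8[(a << 2) | (b << 1) | c],
	 * where a, b, c are the corresponding bits of the three sources.
	 * 0x96 = 0b10010110 is the a ^ b ^ c truth table. */
	for (int i = 0; i < 8; i++) {
		int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;

		assert(((0x96 >> i) & 1) == (a ^ b ^ c));
	}
	return 0;
}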
232 // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
256 vpsrlq $64 - VL/16, \src, \tmp1
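Stepping a tweak to the next one is multiplication by x in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1. Expressed on the same two 64-bit lanes that the shifts above operate on, one step looks like this in C (a reference model of the math, not a translation of the macro):

#include <stdint.h>

/* t[0] holds the low 64 bits of the tweak, t[1] the high 64 bits
 * (little-endian, as loaded from the 16-byte tweak). */
static void xts_next_tweak(uint64_t t[2])
{
	uint64_t lo_carry = t[0] >> 63;	/* bit that crosses into the high half */
	uint64_t hi_carry = t[1] >> 63;	/* bit that falls off the top */

	t[1] = (t[1] << 1) | lo_carry;
	t[0] = (t[0] << 1) ^ (hi_carry ? 0x87 : 0);
}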
265 // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
276 // Compute the second block of TWEAK0.
288 // Compute TWEAK[1-3] from TWEAK0.
289 vpsrlq $64 - 1*VL/16, TWEAK0, V0
290 vpsrlq $64 - 2*VL/16, TWEAK0, V2
291 vpsrlq $64 - 3*VL/16, TWEAK0, V4
355 // shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
356 // to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves
357 // instructions directly. The 128-bit right shift (vpsrldq) performs better
358 // than a 64-bit right shift on Intel CPUs in the context where it is used here,
362 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
364 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
366 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
368 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
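These four sequences advance each tweak by 4*VL/16 blocks at once, i.e. multiply by x^8 or x^16 for the 256-bit and 512-bit versions, which is what makes the 128-bit shift amounts above come out as whole bytes. The underlying math, again on two 64-bit lanes, is roughly the following reference model; the folding loop plays the role of the carry-less multiply (vpclmulqdq) by the GF polynomial:

#include <stdint.h>

/* Multiply a tweak (two little-endian 64-bit lanes) by x^n, 1 <= n <= 63.
 * The low 128-n bits simply shift up; each of the top n bits wraps around as
 * the reduction polynomial 0x87 shifted into place. */
static void xts_mul_x_pow(uint64_t t[2], unsigned int n)
{
	uint64_t spill_top = t[1] >> (64 - n);	/* bits shifted out of the top */
	uint64_t spill_mid = t[0] >> (64 - n);	/* bits crossing the lane boundary */
	uint64_t lo = t[0] << n;
	uint64_t hi = (t[1] << n) | spill_mid;

	for (unsigned int k = 0; spill_top != 0; k++, spill_top >>= 1)
		if (spill_top & 1)
			lo ^= (uint64_t)0x87 << k;

	t[0] = lo;
	t[1] = hi;
}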
390 // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
416 // For AES-128, increment by 3*16, resulting in the 10 round keys (not
417 // counting the zero-th round key which was just loaded into KEY0) being
418 // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
419 // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
420 // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
423 // any round key be in the range [-96, 112], fitting in a signed byte.
424 // This shortens VEX-encoded instructions that access the later round
425 // keys which otherwise would need 4-byte offsets. Second, it makes it
426 // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
435 lea OFFS-16(KEY, KEYLEN64, 4), KEY
442 _vbroadcast128 -6*16(KEY), KEY1
443 _vbroadcast128 -5*16(KEY), KEY2
445 _vbroadcast128 -4*16(KEY), KEY3
446 _vbroadcast128 -3*16(KEY), KEY4
448 _vbroadcast128 -2*16(KEY), KEY5
449 _vbroadcast128 -1*16(KEY), KEY6
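The negative offsets used by those loads follow from the lea above; the arithmetic in the surrounding comments can be checked directly (a standalone check, with OFFS taken as 0):

#include <stdio.h>

int main(void)
{
	for (int keylen = 16; keylen <= 32; keylen += 8) {
		int nrounds = 6 + keylen / 4;	/* 10, 12, or 14 */
		int advance = keylen * 4 - 16;	/* bytes added by the lea when OFFS == 0 */

		/* Round key i starts at byte offset i*16; after the advance,
		 * round keys 1..nrounds land at: */
		printf("AES-%d: advance %d*16, round keys at %d*16(KEY)..%d*16(KEY)\n",
		       keylen * 8, advance / 16,
		       (16 - advance) / 16, (nrounds * 16 - advance) / 16);
	}
	return 0;
}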
461 // Do a single non-last round of AES encryption (if \enc==1) or decryption (if
462 // \enc==0) on the block(s) in \data using the round key(s) in \key. The
481 // Do a single non-last round of AES en/decryption on the block(s) in \data,
482 // using the same key for all block(s). The round key is loaded from the
489 _vaes \enc, (\i-7)*16(KEY), \data
491 _vbroadcast128 (\i-7)*16(KEY), \tmp
497 // Do a single non-last round of AES en/decryption on the blocks in registers
498 // V0-V3, using the same key for all blocks. The round key is loaded from the
503 _tweak_step (2*(\i-5))
506 _tweak_step (2*(\i-5) + 1)
510 _vbroadcast128 (\i-7)*16(KEY), V4
511 _tweak_step (2*(\i-5))
514 _tweak_step (2*(\i-5) + 1)
521 // then XOR with \tweak again) of the block(s) in \data. To process a single
522 // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
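For a single block, the XOR/encrypt/XOR pattern this macro implements corresponds to the following AES-NI intrinsics sketch (round keys assumed already expanded; nrounds is 10, 12, or 14). Decryption has the same shape with _mm_aesdec_si128/_mm_aesdeclast_si128 and the inverse key schedule.

#include <immintrin.h>	/* AES-NI intrinsics; build with -maes */

/* XTS-style handling of one 16-byte block: XOR with the tweak, run the AES
 * rounds ("round 0" is just an XOR with the first round key), then XOR with
 * the tweak again. */
static __m128i xts_crypt_block(const __m128i *rk, int nrounds,
			       __m128i tweak, __m128i block)
{
	__m128i x = _mm_xor_si128(block, tweak);

	x = _mm_xor_si128(x, rk[0]);			/* "round 0" */
	for (int i = 1; i < nrounds; i++)
		x = _mm_aesenc_si128(x, rk[i]);		/* middle rounds */
	x = _mm_aesenclast_si128(x, rk[nrounds]);	/* final round */
	return _mm_xor_si128(x, tweak);
}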
556 // block length, exclude the last full block from the main loop by
559 // the last full block and the partial block specially at the end.
560 lea -16(LEN), %eax
565 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
571 // Compute the first set of tweaks TWEAK[0-3].
574 add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
580 // XOR each source block with its tweak and the zero-th round key.
652 sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
653 sub $-4*VL, DST
654 add $-4*VL, LEN
658 // 4*VL. Handle it out-of-line in order to optimize for the common
660 test $4*VL-1, LEN8
674 add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
683 sub $VL, LEN
686 add $VL-16, LEN // Undo extra sub of VL, then sub 16.
688 add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
700 sub $16, LEN
703 add $16, LEN // Undo the extra sub of 16.
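Putting the length bookkeeping together: the driver first withholds the last full block when the length isn't block-aligned (the lea -16(LEN) adjustment shown earlier), then works through the data in progressively smaller steps. A rough C outline of that control flow, with the per-chunk work elided:

#include <stddef.h>

/* VL is the vector length in bytes: 16, 32, or 64 depending on the variant. */
static void xts_driver_sketch(size_t len, size_t vl)
{
	/* Withhold the last full block if a partial block follows it. */
	size_t remaining = (len % 16 != 0) ? len - 16 : len;

	while (remaining >= 4 * vl) {
		/* main loop: 4 vectors (4*VL/16 blocks) per iteration */
		remaining -= 4 * vl;
	}
	while (vl > 16 && remaining >= vl) {
		/* leftover full vectors, one at a time */
		remaining -= vl;
	}
	while (remaining >= 16) {
		/* leftover full blocks, one at a time */
		remaining -= 16;
	}
	if (len % 16 != 0) {
		/* last full block + partial block, via ciphertext stealing */
	}
}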
711 // If encrypting, the main loop already encrypted the last full block to
714 sub $16, SRC
715 sub $16, DST
718 // If decrypting, the main loop didn't decrypt the last full block
720 // Do it now by advancing the tweak and decrypting the last full block.
728 mov $-1, %r9d
732 // Swap the first LEN bytes of the en/decryption of the last full block
733 // with the partial block. Note that to support in-place en/decryption,
734 // the load from the src partial block must happen before the store to
735 // the dst partial block.
742 // Load the src partial block, left-aligned. Note that to support
743 // in-place en/decryption, this must happen before the store to the dst
744 // partial block.
747 // Shift the first LEN bytes of the en/decryption of the last full block
749 // dst partial block. It also writes to the second part of the dst last
750 // full block, but that part is overwritten later.
754 // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
755 sub LEN64, %r9
758 // Shift the src partial block to the beginning of its register.
761 // Do a blend to generate the src partial block followed by the second
762 // part of the en/decryption of the last full block.
765 // En/decrypt again and store the last full block.
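The swap described in these comments is standard XTS ciphertext stealing. A byte-level C reference for the encryption direction follows; decryption has the same shape but consumes the two final tweaks in the opposite order. xts_block_fn is a placeholder for "XOR with tweak, AES, XOR with tweak again".

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Placeholder: dst = AES(src ^ tweak) ^ tweak for one 16-byte block. */
typedef void (*xts_block_fn)(const void *key, const uint8_t tweak[16],
			     const uint8_t src[16], uint8_t dst[16]);

/* Encrypt the trailing (full block + partial block) pair.  'len' is the
 * partial block's length, 1 <= len <= 15.  't' is the tweak for the last
 * full block, 't_next' the tweak after it. */
static void xts_encrypt_cts(xts_block_fn crypt, const void *key,
			    const uint8_t t[16], const uint8_t t_next[16],
			    const uint8_t *src, uint8_t *dst, size_t len)
{
	uint8_t cc[16], pp[16];

	/* Encrypt the last full block as usual. */
	crypt(key, t, src, cc);

	/* Steal: the first 'len' bytes of that ciphertext become the partial
	 * ciphertext block.  Read the partial plaintext before writing the
	 * partial ciphertext so that in-place operation works. */
	memcpy(pp, src + 16, len);
	memcpy(pp + len, cc + len, 16 - len);
	memcpy(dst + 16, cc, len);

	/* Re-encrypt the recombined block with the next tweak; it becomes the
	 * last full ciphertext block. */
	crypt(key, t_next, pp, dst);
}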
775 // that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
785 lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
789 vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0
790 vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0
792 vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0
793 vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0
795 .irp i, -2,-1,0,1,2,3,4,5,6
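The keylen-driven entry into this round sequence (two extra rounds for AES-256, two more shared with AES-192, then a tail common to all key sizes ending at 7*16 from the re-based pointer) can be sketched with intrinsics. This is an illustration of the offset scheme only, not the kernel's code:

#include <immintrin.h>	/* build with -maes (and -mavx for VEX forms) */

/* 'key' points at the expanded encryption key schedule (round key 0 first);
 * keylen is 16, 24, or 32.  Re-basing the pointer the same way the lea above
 * does puts the final round key at rk[7] for every key size, so shorter keys
 * simply skip the first few rounds. */
static __m128i encrypt_block_sketch(const __m128i *key, int keylen, __m128i x)
{
	const __m128i *rk = key + keylen / 4 - 1;	/* mirrors the lea */

	x = _mm_xor_si128(x, key[0]);			/* round 0: XOR only */
	if (keylen == 32) {
		x = _mm_aesenc_si128(x, rk[-6]);
		x = _mm_aesenc_si128(x, rk[-5]);
	}
	if (keylen >= 24) {
		x = _mm_aesenc_si128(x, rk[-4]);
		x = _mm_aesenc_si128(x, rk[-3]);
	}
	for (int i = -2; i <= 6; i++)			/* the .irp body above */
		x = _mm_aesenc_si128(x, rk[i]);
	return _mm_aesenclast_si128(x, rk[7]);
}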
803 // Below are the actual AES-XTS encryption and decryption functions,