Lines Matching +full:sub +full:- +full:block
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * AES-XTS for modern x86_64 CPUs
11 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
16 * AES-NI + AVX
17 * - 128-bit vectors (1 AES block per vector)
18 * - VEX-coded instructions
19 * - xmm0-xmm15
20 * - This is for older CPUs that lack VAES but do have AVX.
23 * - 256-bit vectors (2 AES blocks per vector)
24 * - VEX-coded instructions
25 * - ymm0-ymm15
26 * - This is for CPUs that have VAES but lack AVX512 or AVX10,
30 * - 256-bit vectors (2 AES blocks per vector)
31 * - EVEX-coded instructions
32 * - ymm0-ymm31
33 * - This is for CPUs that have AVX512 but where using zmm registers causes
35 * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
36 * To avoid confusion with 512-bit, we just write AVX10/256.
39 * - Same as the previous one, but upgrades to 512-bit vectors
40 * (4 AES blocks per vector) in zmm0-zmm31.
41 * - This is for CPUs that have good AVX512 or AVX10/512 support.
43 * This file doesn't have an implementation for AES-NI alone (without AVX), as
51 * The AES-XTS implementations in this file support everything required by the
52 * crypto API, including support for arbitrary input lengths and multi-part
54 * power-of-2 length inputs that are processed in a single part (disk sectors).
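For orientation, here is a minimal C model of the XTS flow that all four implementations compute, block at a time and whole blocks only. The aes_block_fn callback and the function names are placeholders for this sketch, not the kernel's interfaces.

#include <stddef.h>
#include <stdint.h>

/* Placeholder for single-block AES encryption with an already-expanded key. */
typedef void (*aes_block_fn)(const void *key, const uint8_t in[16],
			     uint8_t out[16]);

/* Multiply the tweak by x in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1.
 * XTS treats the 16-byte tweak as a little-endian 128-bit integer. */
static void gf_mul_x(uint8_t t[16])
{
	int carry = t[15] >> 7;

	for (int i = 15; i > 0; i--)
		t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
	t[0] = (uint8_t)(t[0] << 1);
	if (carry)
		t[0] ^= 0x87;
}

/* XTS over whole 16-byte blocks (no partial-block handling). */
static void xts_encrypt_blocks(aes_block_fn encrypt, const void *data_key,
			       const void *tweak_key, const uint8_t iv[16],
			       const uint8_t *src, uint8_t *dst, size_t nblocks)
{
	uint8_t t[16], buf[16];

	/* The first tweak is the IV encrypted with the second ("tweak") key. */
	encrypt(tweak_key, iv, t);

	for (size_t i = 0; i < nblocks; i++, src += 16, dst += 16) {
		for (int j = 0; j < 16; j++)
			buf[j] = src[j] ^ t[j];
		encrypt(data_key, buf, buf);
		for (int j = 0; j < 16; j++)
			dst[j] = buf[j] ^ t[j];
		gf_mul_x(t);		/* tweak for the next block */
	}
}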
73 // on CPUs that don't support AVX10-style masking.
96 // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
110 // advanced to point to 7th-from-last round key
122 // %r9-r11 are available as temporaries.
124 // V0-V3 hold the data blocks during the main loop, or temporary values
125 // otherwise. V4-V5 hold temporary values.
127 // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
135 // V10-V13 are used for computing the next values of TWEAK[0-3].
141 // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
145 // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
149 // If 32 SIMD registers are available, then V16-V29 hold the remaining
150 // AES round keys, copied to all 128-bit lanes.
152 // AES-128, AES-192, and AES-256 use different numbers of round keys.
154 // keys to the *end* of this register range. I.e., AES-128 uses
155 // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
156 // (All also use KEY0 for the XOR-only "round" at the beginning.)
187 // V30-V31 are currently unused.
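The mapping from key length to round count, and from there to the first KEY register used, can be checked with a small standalone calculation (a reference check only; KEYn are the register aliases described above):

#include <stdio.h>

int main(void)
{
	/* AES uses 10, 12, or 14 rounds for 16-, 24-, or 32-byte keys. */
	for (int keylen = 16; keylen <= 32; keylen += 8) {
		int nrounds = 6 + keylen / 4;

		/* Rounds 1..nrounds are aligned to the *end* of KEY1-KEY14,
		 * so the first register used is KEY(15 - nrounds). */
		printf("AES-%d: %d rounds, keys in KEY%d-KEY14 (plus KEY0)\n",
		       keylen * 8, nrounds, 15 - nrounds);
	}
	return 0;
}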
200 // Broadcast a 128-bit value into a vector.
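The same broadcast-then-operate pattern can be written with intrinsics: load one 16-byte round key, replicate it into both 128-bit lanes of a ymm register, and a single VAES instruction then applies that round to two blocks at once. A sketch only, assuming VAES and AVX2 support (e.g. building with -mvaes -mavx2):

#include <immintrin.h>

/* One AES encryption round applied to two blocks packed in a ymm register,
 * with the 16-byte round key replicated into both 128-bit lanes. */
static __m256i aes_round_2blocks(__m256i blocks, const void *round_key)
{
	__m128i rk = _mm_loadu_si128((const __m128i *)round_key);
	__m256i rk2 = _mm256_broadcastsi128_si256(rk);	/* copy to both lanes */

	return _mm256_aesenc_epi128(blocks, rk2);	/* VAES: vaesenc on ymm */
}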
224 // vpternlogd with immediate 0x96 is a three-argument XOR.
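The immediate 0x96 is exactly the truth table of a three-input XOR: bit i of the immediate is the result for the input combination whose three bits are the bits of i. A quick standalone check of that claim:

#include <assert.h>

int main(void)
{
	/* For vpternlogd, each result bit is imm8[(a << 2) | (b << 1) | c],
	 * where a, b, c are the corresponding bits of the three sources.
	 * 0x96 = 0b10010110 is the a ^ b ^ c truth table. */
	for (int i = 0; i < 8; i++) {
		int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;

		assert(((0x96 >> i) & 1) == (a ^ b ^ c));
	}
	return 0;
}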
232 // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
256 vpsrlq $64 - VL/16, \src, \tmp1
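Stepping a tweak to the next one is multiplication by x in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1. Expressed on the same two 64-bit lanes that the shifts above operate on, one step looks like this in C (a reference model of the math, not a translation of the macro):

#include <stdint.h>

/* t[0] holds the low 64 bits of the tweak, t[1] the high 64 bits
 * (little-endian, as loaded from the 16-byte tweak). */
static void xts_next_tweak(uint64_t t[2])
{
	uint64_t lo_carry = t[0] >> 63;	/* bit that crosses into the high half */
	uint64_t hi_carry = t[1] >> 63;	/* bit that falls off the top */

	t[1] = (t[1] << 1) | lo_carry;
	t[0] = (t[0] << 1) ^ (hi_carry ? 0x87 : 0);
}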
265 // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
276 // Compute the second block of TWEAK0.
288 // Compute TWEAK[1-3] from TWEAK0.
289 vpsrlq $64 - 1*VL/16, TWEAK0, V0
290 vpsrlq $64 - 2*VL/16, TWEAK0, V2
291 vpsrlq $64 - 3*VL/16, TWEAK0, V4
355 // shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
356 // to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves
357 // instructions directly. The 128-bit right shift (vpsrldq) performs better
358 // than a 64-bit right shift on Intel CPUs in the context where it is used here,
362 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
364 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
366 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
368 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
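These four sequences advance each tweak by 4*VL/16 blocks at once, i.e. multiply by x^8 or x^16 for the 256-bit and 512-bit versions, which is what makes the 128-bit shift amounts above come out as whole bytes. The underlying math, again on two 64-bit lanes, is roughly the following reference model; the folding loop plays the role of the carry-less multiply (vpclmulqdq) by the GF polynomial:

#include <stdint.h>

/* Multiply a tweak (two little-endian 64-bit lanes) by x^n, 1 <= n <= 63.
 * The low 128-n bits simply shift up; each of the top n bits wraps around as
 * the reduction polynomial 0x87 shifted into place. */
static void xts_mul_x_pow(uint64_t t[2], unsigned int n)
{
	uint64_t spill_top = t[1] >> (64 - n);	/* bits shifted out of the top */
	uint64_t spill_mid = t[0] >> (64 - n);	/* bits crossing the lane boundary */
	uint64_t lo = t[0] << n;
	uint64_t hi = (t[1] << n) | spill_mid;

	for (unsigned int k = 0; spill_top != 0; k++, spill_top >>= 1)
		if (spill_top & 1)
			lo ^= (uint64_t)0x87 << k;

	t[0] = lo;
	t[1] = hi;
}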
390 // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
416 // For AES-128, increment by 3*16, resulting in the 10 round keys (not
417 // counting the zero-th round key which was just loaded into KEY0) being
418 // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
419 // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
420 // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
423 // any round key be in the range [-96, 112], fitting in a signed byte.
424 // This shortens VEX-encoded instructions that access the later round
425 // keys which otherwise would need 4-byte offsets. Second, it makes it
426 // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
435 lea OFFS-16(KEY, KEYLEN64, 4), KEY
442 _vbroadcast128 -6*16(KEY), KEY1
443 _vbroadcast128 -5*16(KEY), KEY2
445 _vbroadcast128 -4*16(KEY), KEY3
446 _vbroadcast128 -3*16(KEY), KEY4
448 _vbroadcast128 -2*16(KEY), KEY5
449 _vbroadcast128 -1*16(KEY), KEY6
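The negative offsets used by those loads follow from the lea above; the arithmetic in the surrounding comments can be checked directly (a standalone check, with OFFS taken as 0):

#include <stdio.h>

int main(void)
{
	for (int keylen = 16; keylen <= 32; keylen += 8) {
		int nrounds = 6 + keylen / 4;	/* 10, 12, or 14 */
		int advance = keylen * 4 - 16;	/* bytes added by the lea when OFFS == 0 */

		/* Round key i starts at byte offset i*16; after the advance,
		 * round keys 1..nrounds land at: */
		printf("AES-%d: advance %d*16, round keys at %d*16(KEY)..%d*16(KEY)\n",
		       keylen * 8, advance / 16,
		       (16 - advance) / 16, (nrounds * 16 - advance) / 16);
	}
	return 0;
}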
461 // Do a single non-last round of AES encryption (if \enc==1) or decryption (if
462 // \enc==0) on the block(s) in \data using the round key(s) in \key. The
481 // Do a single non-last round of AES en/decryption on the block(s) in \data,
482 // using the same key for all block(s). The round key is loaded from the
489 _vaes \enc, (\i-7)*16(KEY), \data
491 _vbroadcast128 (\i-7)*16(KEY), \tmp
497 // Do a single non-last round of AES en/decryption on the blocks in registers
498 // V0-V3, using the same key for all blocks. The round key is loaded from the
503 _tweak_step (2*(\i-5))
506 _tweak_step (2*(\i-5) + 1)
510 _vbroadcast128 (\i-7)*16(KEY), V4
511 _tweak_step (2*(\i-5))
514 _tweak_step (2*(\i-5) + 1)
521 // then XOR with \tweak again) of the block(s) in \data. To process a single
522 // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
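For a single block, the XOR/encrypt/XOR pattern this macro implements corresponds to the following AES-NI intrinsics sketch (round keys assumed already expanded; nrounds is 10, 12, or 14). Decryption has the same shape with _mm_aesdec_si128/_mm_aesdeclast_si128 and the inverse key schedule.

#include <immintrin.h>	/* AES-NI intrinsics; build with -maes */

/* XTS-style handling of one 16-byte block: XOR with the tweak, run the AES
 * rounds ("round 0" is just an XOR with the first round key), then XOR with
 * the tweak again. */
static __m128i xts_crypt_block(const __m128i *rk, int nrounds,
			       __m128i tweak, __m128i block)
{
	__m128i x = _mm_xor_si128(block, tweak);

	x = _mm_xor_si128(x, rk[0]);			/* "round 0" */
	for (int i = 1; i < nrounds; i++)
		x = _mm_aesenc_si128(x, rk[i]);		/* middle rounds */
	x = _mm_aesenclast_si128(x, rk[nrounds]);	/* final round */
	return _mm_xor_si128(x, tweak);
}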
556 // block length, exclude the last full block from the main loop by
559 // the last full block and the partial block specially at the end.
560 lea -16(LEN), %eax
565 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
571 // Compute the first set of tweaks TWEAK[0-3].
574 add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
580 // XOR each source block with its tweak and the zero-th round key.
652 sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
653 sub $-4*VL, DST
654 add $-4*VL, LEN
658 // 4*VL. Handle it out-of-line in order to optimize for the common
660 test $4*VL-1, LEN8
674 add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
683 sub $VL, LEN
686 add $VL-16, LEN // Undo extra sub of VL, then sub 16.
688 add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
700 sub $16, LEN
703 add $16, LEN // Undo the extra sub of 16.
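Putting the length bookkeeping together: the driver first withholds the last full block when the length isn't block-aligned (the lea -16(LEN) adjustment shown earlier), then works through the data in progressively smaller steps. A rough C outline of that control flow, with the per-chunk work elided:

#include <stddef.h>

/* VL is the vector length in bytes: 16, 32, or 64 depending on the variant. */
static void xts_driver_sketch(size_t len, size_t vl)
{
	/* Withhold the last full block if a partial block follows it. */
	size_t remaining = (len % 16 != 0) ? len - 16 : len;

	while (remaining >= 4 * vl) {
		/* main loop: 4 vectors (4*VL/16 blocks) per iteration */
		remaining -= 4 * vl;
	}
	while (vl > 16 && remaining >= vl) {
		/* leftover full vectors, one at a time */
		remaining -= vl;
	}
	while (remaining >= 16) {
		/* leftover full blocks, one at a time */
		remaining -= 16;
	}
	if (len % 16 != 0) {
		/* last full block + partial block, via ciphertext stealing */
	}
}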
711 // If encrypting, the main loop already encrypted the last full block to
714 sub $16, SRC
715 sub $16, DST
718 // If decrypting, the main loop didn't decrypt the last full block
720 // Do it now by advancing the tweak and decrypting the last full block.
728 mov $-1, %r9d
732 // Swap the first LEN bytes of the en/decryption of the last full block
733 // with the partial block. Note that to support in-place en/decryption,
734 // the load from the src partial block must happen before the store to
735 // the dst partial block.
742 // Load the src partial block, left-aligned. Note that to support
743 // in-place en/decryption, this must happen before the store to the dst
744 // partial block.
747 // Shift the first LEN bytes of the en/decryption of the last full block
749 // dst partial block. It also writes to the second part of the dst last
750 // full block, but that part is overwritten later.
754 // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
755 sub LEN64, %r9
758 // Shift the src partial block to the beginning of its register.
761 // Do a blend to generate the src partial block followed by the second
762 // part of the en/decryption of the last full block.
765 // En/decrypt again and store the last full block.
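The swap described in these comments is standard XTS ciphertext stealing. A byte-level C reference for the encryption direction follows; decryption has the same shape but consumes the two final tweaks in the opposite order. xts_block_fn is a placeholder for "XOR with tweak, AES, XOR with tweak again".

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Placeholder: dst = AES(src ^ tweak) ^ tweak for one 16-byte block. */
typedef void (*xts_block_fn)(const void *key, const uint8_t tweak[16],
			     const uint8_t src[16], uint8_t dst[16]);

/* Encrypt the trailing (full block + partial block) pair.  'len' is the
 * partial block's length, 1 <= len <= 15.  't' is the tweak for the last
 * full block, 't_next' the tweak after it. */
static void xts_encrypt_cts(xts_block_fn crypt, const void *key,
			    const uint8_t t[16], const uint8_t t_next[16],
			    const uint8_t *src, uint8_t *dst, size_t len)
{
	uint8_t cc[16], pp[16];

	/* Encrypt the last full block as usual. */
	crypt(key, t, src, cc);

	/* Steal: the first 'len' bytes of that ciphertext become the partial
	 * ciphertext block.  Read the partial plaintext before writing the
	 * partial ciphertext so that in-place operation works. */
	memcpy(pp, src + 16, len);
	memcpy(pp + len, cc + len, 16 - len);
	memcpy(dst + 16, cc, len);

	/* Re-encrypt the recombined block with the next tweak; it becomes the
	 * last full ciphertext block. */
	crypt(key, t_next, pp, dst);
}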
775 // that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
785 lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
789 vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0
790 vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0
792 vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0
793 vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0
795 .irp i, -2,-1,0,1,2,3,4,5,6
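The keylen-driven entry into this round sequence (two extra rounds for AES-256, two more shared with AES-192, then a tail common to all key sizes ending at 7*16 from the re-based pointer) can be sketched with intrinsics. This is an illustration of the offset scheme only, not the kernel's code:

#include <immintrin.h>	/* build with -maes (and -mavx for VEX forms) */

/* 'key' points at the expanded encryption key schedule (round key 0 first);
 * keylen is 16, 24, or 32.  Re-basing the pointer the same way the lea above
 * does puts the final round key at rk[7] for every key size, so shorter keys
 * simply skip the first few rounds. */
static __m128i encrypt_block_sketch(const __m128i *key, int keylen, __m128i x)
{
	const __m128i *rk = key + keylen / 4 - 1;	/* mirrors the lea */

	x = _mm_xor_si128(x, key[0]);			/* round 0: XOR only */
	if (keylen == 32) {
		x = _mm_aesenc_si128(x, rk[-6]);
		x = _mm_aesenc_si128(x, rk[-5]);
	}
	if (keylen >= 24) {
		x = _mm_aesenc_si128(x, rk[-4]);
		x = _mm_aesenc_si128(x, rk[-3]);
	}
	for (int i = -2; i <= 6; i++)			/* the .irp body above */
		x = _mm_aesenc_si128(x, rk[i]);
	return _mm_aesenclast_si128(x, rk[7]);
}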
803 // Below are the actual AES-XTS encryption and decryption functions,