/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing.  However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
	// tweak each time a 1 is carried out of the high 64 bits.
	//
	// The high 64 bits of this value hold just the internal carry bit that
	// exists when there's a carry out of the low 64 bits of the tweak.
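	//
	// In other words, the tweak is treated as an element of GF(2^128) with
	// reduction polynomial x^128 + x^7 + x^2 + x + 1; multiplying it by x
	// is a 128-bit left shift by 1 that XORs 0x87 into the low byte
	// whenever bit 127 shifts out.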
	.quad	0x87, 1

	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX10-style masking.
.Lcts_permute_table:
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
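	//
	// A vpshufb mask byte with its high bit set (0x80) produces a zero
	// output byte, while a byte 0x00-0x0f selects that source byte.  So
	// reading 16 bytes of this table at offset LEN (1 <= LEN <= 15) gives a
	// mask that moves the first LEN source bytes to the end of the result
	// and zeroes the rest; e.g. LEN=5 gives eleven 0x80 bytes followed by
	// 0x00-0x04.  Reading at offset 32 - LEN instead gives the mask
	// [16-LEN, ..., 15, 0x80, ...], which moves the last LEN source bytes
	// to the beginning.  The ciphertext stealing code below uses both views.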
.text

.macro	_define_Vi	i
.if VL == 16
	.set	V\i,		%xmm\i
.elseif VL == 32
	.set	V\i,		%ymm\i
.elseif VL == 64
	.set	V\i,		%zmm\i
.else
	.error "Unsupported Vector Length (VL)"
.endif
.endm

.macro _define_aliases
	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
	// are available, that map to the xmm, ymm, or zmm registers according
	// to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	_define_Vi	\i
.endr
.if USE_AVX10
.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	_define_Vi	\i
.endr
.endif

	// Function parameters
	.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
					// advanced to point to 7th-from-last round key
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes
	.set	LEN8,		%cl
	.set	LEN64,		%rcx
	.set	TWEAK,		%r8	// Pointer to next tweak

	// %rax holds the AES key length in bytes.
	.set	KEYLEN,		%eax
	.set	KEYLEN64,	%rax

	// %r9-r11 are available as temporaries.

	// V0-V3 hold the data blocks during the main loop, or temporary values
	// otherwise.  V4-V5 hold temporary values.

	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
	.set	TWEAK0_XMM,	%xmm6
	.set	TWEAK0,		V6
	.set	TWEAK1_XMM,	%xmm7
	.set	TWEAK1,		V7
	.set	TWEAK2,		V8
	.set	TWEAK3,		V9

	// V10-V13 are used for computing the next values of TWEAK[0-3].
	.set	NEXT_TWEAK0,	V10
	.set	NEXT_TWEAK1,	V11
	.set	NEXT_TWEAK2,	V12
	.set	NEXT_TWEAK3,	V13

	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
	.set	GF_POLY_XMM,	%xmm14
	.set	GF_POLY,	V14

	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
	.set	KEY0_XMM,	%xmm15
	.set	KEY0,		V15

	// If 32 SIMD registers are available, then V16-V29 hold the remaining
	// AES round keys, copied to all 128-bit lanes.
	//
	// AES-128, AES-192, and AES-256 use different numbers of round keys.
	// To allow handling all three variants efficiently, we align the round
	// keys to the *end* of this register range.  I.e., AES-128 uses
	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
	// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
	.set	KEY1_XMM,	%xmm16
	.set	KEY1,		V16
	.set	KEY2_XMM,	%xmm17
	.set	KEY2,		V17
	.set	KEY3_XMM,	%xmm18
	.set	KEY3,		V18
	.set	KEY4_XMM,	%xmm19
	.set	KEY4,		V19
	.set	KEY5_XMM,	%xmm20
	.set	KEY5,		V20
	.set	KEY6_XMM,	%xmm21
	.set	KEY6,		V21
	.set	KEY7_XMM,	%xmm22
	.set	KEY7,		V22
	.set	KEY8_XMM,	%xmm23
	.set	KEY8,		V23
	.set	KEY9_XMM,	%xmm24
	.set	KEY9,		V24
	.set	KEY10_XMM,	%xmm25
	.set	KEY10,		V25
	.set	KEY11_XMM,	%xmm26
	.set	KEY11,		V26
	.set	KEY12_XMM,	%xmm27
	.set	KEY12,		V27
	.set	KEY13_XMM,	%xmm28
	.set	KEY13,		V28
	.set	KEY14_XMM,	%xmm29
	.set	KEY14,		V29
.endif
	// V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.
// The register operand must be in the first 16 vector registers.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector.
.macro	_vbroadcast128	src, dst
.if VL == 16 && !USE_AVX10
	vmovdqu		\src, \dst
.elseif VL == 32 && !USE_AVX10
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
// Any register operands must be in the first 16 vector registers.
.macro	_vpxor	src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// XOR three vectors together.
.macro	_xor3	src1, src2, src3_and_dst
.if USE_AVX10
	// vpternlogd with immediate 0x96 is a three-argument XOR.
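	// (The immediate is the truth table indexed by the three input bits;
	// XOR is 1 exactly when an odd number of inputs are set, i.e. in rows
	// 1, 2, 4, and 7, giving 0b10010110 = 0x96.)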
	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
.else
	vpxor		\src1, \src3_and_dst, \src3_and_dst
	vpxor		\src2, \src3_and_dst, \src3_and_dst
.endif
.endm

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
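//
// Conceptually this is a 128-bit left shift by 1 with GF(2^128) reduction.  A
// rough C sketch of the same computation (for reference only, not built):
//
//	u64 lo = tweak[0], hi = tweak[1];
//	tweak[0] = (lo << 1) ^ (0x87 & -(hi >> 63));	// reduce if bit 127 was set
//	tweak[1] = (hi << 1) | (lo >> 63);		// carry bit 63 into bit 64
//
// The SIMD version below derives the two conditional terms from sign bits:
// vpshufd $0x13 positions copies of the dwords holding bits 63 and 127,
// vpsrad $31 turns their sign bits into 32-bit masks, and ANDing with
// .Lgf_poly (0x87 in the low half, 1 in the high half) yields exactly the
// values to XOR into the doubled tweak produced by vpaddq.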
.macro	_next_tweak	src, tmp, dst
	vpshufd		$0x13, \src, \tmp
	vpaddq		\src, \src, \dst
	vpsrad		$31, \tmp, \tmp
.if USE_AVX10
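	// vpternlogd with immediate 0x78 computes dst ^= (tmp & GF_POLY_XMM),
	// folding the vpand + vpxor of the non-AVX10 path into one instruction.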
	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
.else
	vpand		GF_POLY_XMM, \tmp, \tmp
	vpxor		\tmp, \dst, \dst
.endif
.endm

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel.  If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
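//
// The vpclmulqdq path works per 128-bit lane as follows, with N = VL/16:
// vpsrlq extracts the N bits shifted out of the top of each 64-bit half,
// vpslldq $8 moves the low half's overflow up into the high half's position,
// vpclmulqdq multiplies the high half's overflow (the bits leaving bit 127)
// by 0x87 to form the GF(2^128) reduction, and the final three-way XOR
// combines both carry terms with the vpsllq-shifted tweak.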
.macro	_next_tweakvec	src, tmp1, tmp2, dst
.if VL == 16
	_next_tweak	\src, \tmp1, \dst
.else
	vpsrlq		$64 - VL/16, \src, \tmp1
	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
	vpslldq		$8, \tmp1, \tmp1
	vpsllq		$VL/16, \src, \dst
	_xor3		\tmp1, \tmp2, \dst
.endif
.endm

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
.macro	_compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
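//
// The steps are numbered so that each tweak's five instructions occupy \i
// through \i+4: steps 0-4 derive NEXT_TWEAK0 from TWEAK3, steps 5-9 derive
// NEXT_TWEAK1 from NEXT_TWEAK0, and so on.  Step 1000 then commits
// NEXT_TWEAK[0-3] into TWEAK[0-3].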
.macro	_tweak_step_mulx	i
.if \i == 0
	.set PREV_TWEAK, TWEAK3
	.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
	.set PREV_TWEAK, NEXT_TWEAK0
	.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
	.set PREV_TWEAK, NEXT_TWEAK1
	.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
	.set PREV_TWEAK, NEXT_TWEAK2
	.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
	vpshufd		$0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
	vpsrad		$31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
	vpand		GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
	vmovdqa		NEXT_TWEAK0, TWEAK0
	vmovdqa		NEXT_TWEAK1, TWEAK1
	vmovdqa		NEXT_TWEAK2, TWEAK2
	vmovdqa		NEXT_TWEAK3, TWEAK3
.endif
.endm

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
// each tweak by x^(4*VL/16) independently.
//
// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
// to do 128-bit wide shifts.  The 128-bit left shift (vpslldq) saves
// instructions directly.  The 128-bit right shift (vpsrldq) performs better
// than a 64-bit right shift on Intel CPUs in the context where it is used here,
// because it runs on a different execution port from the AES instructions.
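//
// Per 128-bit lane the sequence is: vpsrldq extracts the top 4*VL/16 bits of
// the lane (the bits that would overflow bit 127), vpclmulqdq multiplies them
// by 0x87 to form the reduction term, vpslldq does the byte-aligned 128-bit
// left shift (which also carries the low half's overflow into the high half),
// and the final XOR folds the reduction back in.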
.macro	_tweak_step_pclmul	i
.if \i == 0
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.  Clobbers V5.
.macro	_tweak_step	i
.if VL == 16
	_tweak_step_mulx	\i
.else
	_tweak_step_pclmul	\i
.endif
.endm

.macro	_setup_round_keys	enc

	// Select either the encryption round keys or the decryption round keys.
.if \enc
	.set	OFFS, 0
.else
	.set	OFFS, 240
.endif

	// Load the round key for "round 0".
	_vbroadcast128	OFFS(KEY), KEY0

	// Increment KEY to make it so that 7*16(KEY) is the last round key.
	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
	// counting the zero-th round key which was just loaded into KEY0) being
	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
	//
	// This rebasing provides two benefits.  First, it makes the offset to
	// any round key be in the range [-96, 112], fitting in a signed byte.
	// This shortens VEX-encoded instructions that access the later round
	// keys which otherwise would need 4-byte offsets.  Second, it makes it
	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
	// beginning.  Skipping rounds at the end doesn't work as well because
	// the last round needs different instructions.
	//
	// An alternative approach would be to roll up all the round loops.  We
	// don't do that because (a) it isn't compatible with caching the round
	// keys in registers which we do when possible (see below), (b) we
	// interleave the AES rounds with the XTS tweak computation, and (c) it
	// seems unwise to rely *too* heavily on the CPU's branch predictor.
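	//
	// The lea below computes the increment as OFFS + KEYLEN*4 - 16: the key
	// length in bytes (16, 24, or 32) times 4 is 64, 96, or 128, which is
	// 16 more than the desired 3*16, 5*16, or 7*16.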
	lea		OFFS-16(KEY, KEYLEN64, 4), KEY

	// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
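	// KEYLEN is 16, 24, or 32; the branches below skip loading KEY1-KEY2
	// and/or KEY3-KEY4, which AES-192 and AES-128 don't use.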
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vbroadcast128	-6*16(KEY), KEY1
	_vbroadcast128	-5*16(KEY), KEY2
.Laes192\@:
	_vbroadcast128	-4*16(KEY), KEY3
	_vbroadcast128	-3*16(KEY), KEY4
.Laes128\@:
	_vbroadcast128	-2*16(KEY), KEY5
	_vbroadcast128	-1*16(KEY), KEY6
	_vbroadcast128	0*16(KEY), KEY7
	_vbroadcast128	1*16(KEY), KEY8
	_vbroadcast128	2*16(KEY), KEY9
	_vbroadcast128	3*16(KEY), KEY10
	_vbroadcast128	4*16(KEY), KEY11
	_vbroadcast128	5*16(KEY), KEY12
	_vbroadcast128	6*16(KEY), KEY13
	_vbroadcast128	7*16(KEY), KEY14
.endif
.endm

// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
// \enc==0) on the block(s) in \data using the round key(s) in \key.  The
// register length determines the number of AES blocks en/decrypted.
.macro	_vaes	enc, key, data
.if \enc
	vaesenc		\key, \data, \data
.else
	vaesdec		\key, \data, \data
.endif
.endm

// Same as _vaes, but does the last round.
.macro	_vaeslast	enc, key, data
.if \enc
	vaesenclast	\key, \data, \data
.else
	vaesdeclast	\key, \data, \data
.endif
.endm

// Do a single non-last round of AES en/decryption on the block(s) in \data,
// using the same key for all block(s).  The round key is loaded from the
// appropriate register or memory location for round \i.  May clobber \tmp.
.macro _vaes_1x		enc, i, xmm_suffix, data, tmp
.if USE_AVX10
	_vaes		\enc, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
	_vaes		\enc, (\i-7)*16(KEY), \data
.else
	_vbroadcast128	(\i-7)*16(KEY), \tmp
	_vaes		\enc, \tmp, \data
.endif
.endif
.endm

// Do a single non-last round of AES en/decryption on the blocks in registers
// V0-V3, using the same key for all blocks.  The round key is loaded from the
// appropriate register or memory location for round \i.  In addition, does two
// steps of the computation of the next set of tweaks.  May clobber V4 and V5.
.macro	_vaes_4x	enc, i
.if USE_AVX10
	_tweak_step	(2*(\i-5))
	_vaes		\enc, KEY\i, V0
	_vaes		\enc, KEY\i, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, KEY\i, V2
	_vaes		\enc, KEY\i, V3
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_tweak_step	(2*(\i-5))
	_vaes		\enc, V4, V0
	_vaes		\enc, V4, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, V4, V2
	_vaes		\enc, V4, V3
.endif
.endm

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data.  To process a single
// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty.  Clobbers \tmp.
.macro	_aes_crypt	enc, xmm_suffix, tweak, data, tmp
	_xor3		KEY0\xmm_suffix, \tweak, \data
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vaes_1x	\enc, 1, \xmm_suffix, \data, tmp=\tmp
	_vaes_1x	\enc, 2, \xmm_suffix, \data, tmp=\tmp
.Laes192\@:
	_vaes_1x	\enc, 3, \xmm_suffix, \data, tmp=\tmp
	_vaes_1x	\enc, 4, \xmm_suffix, \data, tmp=\tmp
.Laes128\@:
.irp i, 5,6,7,8,9,10,11,12,13
	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
.endr
.if USE_AVX10
	vpxord		KEY14\xmm_suffix, \tweak, \tmp
.else
.ifnb \xmm_suffix
	vpxor		7*16(KEY), \tweak, \tmp
.else
	_vbroadcast128	7*16(KEY), \tmp
	vpxor		\tweak, \tmp, \tmp
.endif
.endif
	_vaeslast	\enc, \tmp, \data
.endm

.macro	_aes_xts_crypt	enc
	_define_aliases

.if !\enc
	// When decrypting a message whose length isn't a multiple of the AES
	// block length, exclude the last full block from the main loop by
	// subtracting 16 from LEN.  This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order.  We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), KEYLEN

	// Setup the pointer to the round keys and cache as many as possible.
	_setup_round_keys	\enc

	// Compute the first set of tweaks TWEAK[0-3].
	_compute_first_set_of_tweaks

	add		$-4*VL, LEN  // shorter than 'sub 4*VL' when VL=32
	jl		.Lhandle_remainder\@

.Lmain_loop\@:
	// This is the main loop, en/decrypting 4*VL bytes per iteration.

	// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
	_vmovdqu	0*VL(SRC), V0
	_vmovdqu	1*VL(SRC), V1
	_vmovdqu	2*VL(SRC), V2
	_vmovdqu	3*VL(SRC), V3
	vpternlogd	$0x96, TWEAK0, KEY0, V0
	vpternlogd	$0x96, TWEAK1, KEY0, V1
	vpternlogd	$0x96, TWEAK2, KEY0, V2
	vpternlogd	$0x96, TWEAK3, KEY0, V3
.else
	vpxor		0*VL(SRC), KEY0, V0
	vpxor		1*VL(SRC), KEY0, V1
	vpxor		2*VL(SRC), KEY0, V2
	vpxor		3*VL(SRC), KEY0, V3
	vpxor		TWEAK0, V0, V0
	vpxor		TWEAK1, V1, V1
	vpxor		TWEAK2, V2, V2
	vpxor		TWEAK3, V3, V3
.endif
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	// Do all the AES rounds on the data blocks, interleaved with
	// the computation of the next set of tweaks.
	_vaes_4x	\enc, 1
	_vaes_4x	\enc, 2
.Laes192\@:
	_vaes_4x	\enc, 3
	_vaes_4x	\enc, 4
.Laes128\@:
.irp i, 5,6,7,8,9,10,11,12,13
	_vaes_4x	\enc, \i
.endr
	// Do the last AES round, then XOR the results with the tweaks again.
	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
	// (and likewise for vaesdeclast).
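	// (This property holds because the last AES round ends with a plain XOR
	// of the round key, and XOR is associative and commutative.)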
.if USE_AVX10
	_tweak_step	18
	_tweak_step	19
	vpxord		TWEAK0, KEY14, V4
	vpxord		TWEAK1, KEY14, V5
	_vaeslast	\enc, V4, V0
	_vaeslast	\enc, V5, V1
	vpxord		TWEAK2, KEY14, V4
	vpxord		TWEAK3, KEY14, V5
	_vaeslast	\enc, V4, V2
	_vaeslast	\enc, V5, V3
.else
	_vbroadcast128	7*16(KEY), V4
	_tweak_step	18 // uses V5
	_tweak_step	19 // uses V5
	vpxor		TWEAK0, V4, V5
	_vaeslast	\enc, V5, V0
	vpxor		TWEAK1, V4, V5
	_vaeslast	\enc, V5, V1
	vpxor		TWEAK2, V4, V5
	vpxor		TWEAK3, V4, V4
	_vaeslast	\enc, V5, V2
	_vaeslast	\enc, V4, V3
.endif

	// Store the destination blocks.
	_vmovdqu	V0, 0*VL(DST)
	_vmovdqu	V1, 1*VL(DST)
	_vmovdqu	V2, 2*VL(DST)
	_vmovdqu	V3, 3*VL(DST)

	// Finish computing the next set of tweaks.
	_tweak_step	1000

	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	jge		.Lmain_loop\@

	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL.  Handle it out-of-line in order to optimize for the common
	// case.  In the common case, just fall through to the ret.
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
	vmovdqu		TWEAK0_XMM, (TWEAK)
.if VL > 16
	vzeroupper
.endif
	RET

.Lhandle_remainder\@:

	// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
	jl		.Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
	_vmovdqu	(SRC), V0
	_aes_crypt	\enc, , TWEAK0, V0, tmp=V1
	_vmovdqu	V0, (DST)
	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
	add		$VL, SRC
	add		$VL, DST
	sub		$VL, LEN
	jge		.Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
.else
	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
.endif

	// En/decrypt any remaining full blocks, one at a time.
	jl		.Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
	vmovdqu		%xmm0, (DST)
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jge		.Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
	add		$16, LEN	// Undo the extra sub of 16.
	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
	jz		.Ldone\@

	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
	// Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
	// If encrypting, the main loop already encrypted the last full block to
	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
	// by rewinding the pointers and loading the intermediate ciphertext.
	sub		$16, SRC
	sub		$16, DST
	vmovdqu		(DST), %xmm0
.else
	// If decrypting, the main loop didn't decrypt the last full block
	// because CTS decryption uses the last two tweaks in reverse order.
	// Do it now by advancing the tweak and decrypting the last full block.
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
.endif

.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1
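	// (bzhi clears all bits at position LEN and above, so %k1 covers
	// exactly the LEN partial-block bytes; e.g. LEN=5 yields mask 0x1f.)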

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block.  Note that to support in-place en/decryption,
	// the load from the src partial block must happen before the store to
	// the dst partial block.
	vmovdqa		%xmm0, %xmm1
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned.  Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN.  This stores the
	// dst partial block.  It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1

	// Do a blend to generate the src partial block followed by the second
	// part of the en/decryption of the last full block.
	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
.endif
	// En/decrypt again and store the last full block.
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
	vmovdqu		%xmm0, (DST)
	jmp		.Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	.set	TWEAK_KEY,	%rdi
	.set	IV,		%rsi
	.set	KEYLEN,		%eax
	.set	KEYLEN64,	%rax

	vmovdqu		(IV), %xmm0
	vpxor		(TWEAK_KEY), %xmm0, %xmm0
	movl		480(TWEAK_KEY), KEYLEN
	lea		-16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
	cmp		$24, KEYLEN
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	vaesenc		-6*16(TWEAK_KEY), %xmm0, %xmm0
	vaesenc		-5*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes192:
	vaesenc		-4*16(TWEAK_KEY), %xmm0, %xmm0
	vaesenc		-3*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes128:
.irp i, -2,-1,0,1,2,3,4,5,6
	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
.endr
	vaesenclast	7*16(TWEAK_KEY), %xmm0, %xmm0
	vmovdqu		%xmm0, (IV)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
//			  const u8 *src, u8 *dst, int len,
//			  u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
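//
// Rough sketch of how a caller might drive these functions from C (reference
// only, not built here; local variable names are illustrative, and the SIMD
// section must run between kernel_fpu_begin() and kernel_fpu_end()):
//
//	struct crypto_aes_ctx tweak_ctx, key_ctx;	// expanded key schedules
//	u8 tweak[AES_BLOCK_SIZE];
//
//	memcpy(tweak, iv, AES_BLOCK_SIZE);
//	kernel_fpu_begin();
//	aes_xts_encrypt_iv(&tweak_ctx, tweak);
//	aes_xts_encrypt_aesni_avx(&key_ctx, src, dst, len, tweak);
//	kernel_fpu_end();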

.set	VL, 16
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

.set	VL, 32
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

.set	VL, 64
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */